This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new b155461f7 Loosen nullability restrictions added in #3205 (#3226)
(#3244)
b155461f7 is described below
commit b155461f770eb2ab8cc5d3296f6123582cf5073d
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Mon Dec 5 10:34:02 2022 +0000
Loosen nullability restrictions added in #3205 (#3226) (#3244)
* Loosen nullability restrictions added in #3205 (#3226)
* Fix tests
* More test fixes
* Yet more incorrect tests
* Review feedback
---
arrow-array/src/array/binary_array.rs | 2 +-
arrow-array/src/array/mod.rs | 4 +-
arrow-array/src/array/string_array.rs | 5 +-
arrow-array/src/array/struct_array.rs | 30 +++++-----
arrow-cast/src/cast.rs | 3 +-
arrow-data/src/data.rs | 103 +++++++++++++++++++++++++++++++++-
arrow-select/src/take.rs | 4 +-
arrow/src/compute/kernels/limit.rs | 4 +-
arrow/src/row/mod.rs | 35 ++----------
9 files changed, 133 insertions(+), 57 deletions(-)
diff --git a/arrow-array/src/array/binary_array.rs
b/arrow-array/src/array/binary_array.rs
index 0b526ecb3..3a30d748e 100644
--- a/arrow-array/src/array/binary_array.rs
+++ b/arrow-array/src/array/binary_array.rs
@@ -531,7 +531,7 @@ mod tests {
let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
- Field::new("item", DataType::UInt8, false),
+ Field::new("item", DataType::UInt8, true),
));
// [None, Some(b"Parquet")]
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index 0f9a2ce59..1e17e35d0 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -915,8 +915,10 @@ mod tests {
#[test]
fn test_null_struct() {
+ // It is possible to create a null struct containing a non-nullable
child
+ // see https://github.com/apache/arrow-rs/pull/3244 for details
let struct_type =
- DataType::Struct(vec![Field::new("data", DataType::Int64, true)]);
+ DataType::Struct(vec![Field::new("data", DataType::Int64, false)]);
let array = new_null_array(&struct_type, 9);
let a = array.as_any().downcast_ref::<StructArray>().unwrap();
diff --git a/arrow-array/src/array/string_array.rs
b/arrow-array/src/array/string_array.rs
index fb3bb2317..c8db589e3 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -608,8 +608,11 @@ mod tests {
.unwrap();
let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
+
+ // It is possible to create a null struct containing a non-nullable
child
+ // see https://github.com/apache/arrow-rs/pull/3244 for details
let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
- Field::new("item", DataType::UInt8, false),
+ Field::new("item", DataType::UInt8, true),
));
// [None, Some(b"Parquet")]
diff --git a/arrow-array/src/array/struct_array.rs
b/arrow-array/src/array/struct_array.rs
index 7d88cc5c6..bf6489c13 100644
--- a/arrow-array/src/array/struct_array.rs
+++ b/arrow-array/src/array/struct_array.rs
@@ -227,13 +227,6 @@ impl From<Vec<(Field, ArrayRef)>> for StructArray {
field_value.data().data_type(),
"the field data types must match the array data in a
StructArray"
);
- // Check nullability of child arrays
- if !field_type.is_nullable() {
- assert!(
- field_value.null_count() == 0,
- "non-nullable field cannot have null values"
- );
- }
},
);
@@ -241,6 +234,10 @@ impl From<Vec<(Field, ArrayRef)>> for StructArray {
.child_data(field_values.into_iter().map(|a|
a.into_data()).collect())
.len(length);
let array_data = unsafe { array_data.build_unchecked() };
+
+ // We must validate nullability
+ array_data.validate_nulls().unwrap();
+
Self::from(array_data)
}
}
@@ -283,13 +280,6 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for
StructArray {
field_value.data().data_type(),
"the field data types must match the array data in a
StructArray"
);
- // Check nullability of child arrays
- if !field_type.is_nullable() {
- assert!(
- field_value.null_count() == 0,
- "non-nullable field cannot have null values"
- );
- }
},
);
@@ -298,6 +288,10 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for
StructArray {
.child_data(field_values.into_iter().map(|a|
a.into_data()).collect())
.len(length);
let array_data = unsafe { array_data.build_unchecked() };
+
+ // We must validate nullability
+ array_data.validate_nulls().unwrap();
+
Self::from(array_data)
}
}
@@ -470,8 +464,8 @@ mod tests {
.unwrap();
let field_types = vec![
- Field::new("a", DataType::Boolean, false),
- Field::new("b", DataType::Int32, false),
+ Field::new("a", DataType::Boolean, true),
+ Field::new("b", DataType::Int32, true),
];
let struct_array_data =
ArrayData::builder(DataType::Struct(field_types))
.len(5)
@@ -568,7 +562,9 @@ mod tests {
}
#[test]
- #[should_panic(expected = "non-nullable field cannot have null values")]
+ #[should_panic(
+ expected = "non-nullable child of type Int32 contains nulls not
present in parent Struct"
+ )]
fn test_struct_array_from_mismatched_nullability() {
drop(StructArray::from(vec![(
Field::new("c", DataType::Int32, false),
diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs
index 272a422eb..7bb3aeb96 100644
--- a/arrow-cast/src/cast.rs
+++ b/arrow-cast/src/cast.rs
@@ -6594,7 +6594,8 @@ mod tests {
cast_from_null_to_other(&data_type);
// Cast null from and to struct
- let data_type = DataType::Struct(vec![Field::new("data",
DataType::Int64, true)]);
+ let data_type =
+ DataType::Struct(vec![Field::new("data", DataType::Int64, false)]);
cast_from_null_to_other(&data_type);
}
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index b230dfdb7..b38321aac 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -19,6 +19,7 @@
//! common attributes and operations for Arrow array.
use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap};
+use arrow_buffer::bit_chunk_iterator::BitChunks;
use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer};
use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode};
use half::f16;
@@ -975,6 +976,7 @@ impl ArrayData {
/// see [`Self::validate_full`]
pub fn validate_data(&self) -> Result<(), ArrowError> {
self.validate()?;
+
self.validate_nulls()?;
self.validate_values()?;
Ok(())
@@ -1001,7 +1003,13 @@ impl ArrayData {
Ok(())
}
- /// Validates the the null count is correct
+ /// Validates the values stored within this [`ArrayData`] are valid
+ /// without recursing into child [`ArrayData`]
+ ///
+ /// Does not (yet) check
+ /// 1. Union type_ids are valid see
[#85](https://github.com/apache/arrow-rs/issues/85)
+ /// Validates the the null count is correct and that any
+ /// nullability requirements of its children are correct
pub fn validate_nulls(&self) -> Result<(), ArrowError> {
let nulls = self.null_buffer();
@@ -1012,9 +1020,102 @@ impl ArrayData {
self.null_count, actual_null_count
)));
}
+
+ // In general non-nullable children should not contain nulls, however,
for certain
+ // types, such as StructArray and FixedSizeList, nulls in the parent
take up
+ // space in the child. As such we permit nulls in the children in the
corresponding
+ // positions for such types
+ match &self.data_type {
+ DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _)
=> {
+ if !f.is_nullable() {
+ self.validate_non_nullable(None, 0, &self.child_data[0])?
+ }
+ }
+ DataType::FixedSizeList(field, len) => {
+ let child = &self.child_data[0];
+ if !field.is_nullable() {
+ match nulls {
+ Some(nulls) => {
+ let element_len = *len as usize;
+ let mut buffer =
+ MutableBuffer::new_null(element_len *
self.len);
+
+ // Expand each bit within `null_mask` into
`element_len`
+ // bits, constructing the implicit mask of the
child elements
+ for i in 0..self.len {
+ if !bit_util::get_bit(nulls.as_ref(),
self.offset + i) {
+ continue;
+ }
+ for j in 0..element_len {
+ bit_util::set_bit(
+ buffer.as_mut(),
+ i * element_len + j,
+ )
+ }
+ }
+ let mask = buffer.into();
+ self.validate_non_nullable(Some(&mask), 0, child)?;
+ }
+ None => self.validate_non_nullable(None, 0, child)?,
+ }
+ }
+ }
+ DataType::Struct(fields) => {
+ for (field, child) in fields.iter().zip(&self.child_data) {
+ if !field.is_nullable() {
+ self.validate_non_nullable(nulls, self.offset, child)?
+ }
+ }
+ }
+ _ => {}
+ }
+
Ok(())
}
+ /// Verifies that `child` contains no nulls not present in `mask`
+ fn validate_non_nullable(
+ &self,
+ mask: Option<&Buffer>,
+ offset: usize,
+ data: &ArrayData,
+ ) -> Result<(), ArrowError> {
+ let mask = match mask {
+ Some(mask) => mask.as_ref(),
+ None => return match data.null_count {
+ 0 => Ok(()),
+ _ => Err(ArrowError::InvalidArgumentError(format!(
+ "non-nullable child of type {} contains nulls not present
in parent {}",
+ data.data_type(),
+ self.data_type
+ ))),
+ },
+ };
+
+ match data.null_buffer() {
+ Some(nulls) => {
+ let mask = BitChunks::new(mask, offset, data.len);
+ let nulls = BitChunks::new(nulls.as_ref(), data.offset,
data.len);
+ mask
+ .iter()
+ .zip(nulls.iter())
+ .chain(std::iter::once((
+ mask.remainder_bits(),
+ nulls.remainder_bits(),
+ ))).try_for_each(|(m, c)| {
+ if (m & !c) != 0 {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "non-nullable child of type {} contains nulls not
present in parent",
+ data.data_type()
+ )))
+ }
+ Ok(())
+ })
+ }
+ None => Ok(()),
+ }
+ }
+
/// Validates the values stored within this [`ArrayData`] are valid
/// without recursing into child [`ArrayData`]
///
diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs
index 857b6e323..0b1d44319 100644
--- a/arrow-select/src/take.rs
+++ b/arrow-select/src/take.rs
@@ -1603,7 +1603,7 @@ mod tests {
let list_data_type = DataType::$list_data_type(Box::new(Field::new(
"item",
DataType::Int32,
- false,
+ true,
)));
let list_data = ArrayData::builder(list_data_type.clone())
.len(4)
@@ -1676,7 +1676,7 @@ mod tests {
let list_data_type = DataType::$list_data_type(Box::new(Field::new(
"item",
DataType::Int32,
- false,
+ true,
)));
let list_data = ArrayData::builder(list_data_type.clone())
.len(4)
diff --git a/arrow/src/compute/kernels/limit.rs
b/arrow/src/compute/kernels/limit.rs
index 1f6c6aec5..0d92e98cf 100644
--- a/arrow/src/compute/kernels/limit.rs
+++ b/arrow/src/compute/kernels/limit.rs
@@ -158,8 +158,8 @@ mod tests {
.unwrap();
let field_types = vec![
- Field::new("a", DataType::Boolean, false),
- Field::new("b", DataType::Int32, false),
+ Field::new("a", DataType::Boolean, true),
+ Field::new("b", DataType::Int32, true),
];
let struct_array_data =
ArrayData::builder(DataType::Struct(field_types))
.len(5)
diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs
index abb8039cc..ea3def6ac 100644
--- a/arrow/src/row/mod.rs
+++ b/arrow/src/row/mod.rs
@@ -1225,36 +1225,11 @@ unsafe fn decode_column(
}
}
Codec::Struct(converter, _) => {
- let child_fields = match &field.data_type {
- DataType::Struct(f) => f,
- _ => unreachable!(),
- };
-
let (null_count, nulls) = fixed::decode_nulls(rows);
rows.iter_mut().for_each(|row| *row = &row[1..]);
let children = converter.convert_raw(rows, validate_utf8)?;
- let child_data = child_fields
- .iter()
- .zip(&children)
- .map(|(f, c)| {
- let data = c.data().clone();
- match f.is_nullable() {
- true => data,
- false => {
- assert_eq!(data.null_count(), null_count);
- // Need to strip out null buffer if any as this is
created
- // as an artifact of the row encoding process that
encodes
- // nulls from the parent struct array in the
children
- data.into_builder()
- .null_count(0)
- .null_bit_buffer(None)
- .build_unchecked()
- }
- }
- })
- .collect();
-
+ let child_data = children.iter().map(|c|
c.data().clone()).collect();
let builder = ArrayDataBuilder::new(field.data_type.clone())
.len(rows.len())
.null_count(null_count)
@@ -1712,11 +1687,8 @@ mod tests {
let back = converter.convert_rows(&r2).unwrap();
assert_eq!(back.len(), 1);
assert_eq!(&back[0], &s2);
- let back_s = as_struct_array(&back[0]);
- for c in back_s.columns() {
- // Children should not contain nulls
- assert_eq!(c.null_count(), 0);
- }
+
+ back[0].data().validate_full().unwrap();
}
#[test]
@@ -2198,6 +2170,7 @@ mod tests {
let back = converter.convert_rows(&rows).unwrap();
for (actual, expected) in back.iter().zip(&arrays) {
+ actual.data().validate_full().unwrap();
assert_eq!(actual, expected)
}
}