This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 158e54da2cd Fix nested nullability when randomly generating arrays
(#5713)
158e54da2cd is described below
commit 158e54da2cd8a2ac4e54744ad8861aded0d43281
Author: Alex Wilcoxson <[email protected]>
AuthorDate: Sat May 11 04:57:44 2024 -0500
Fix nested nullability when randomly generating arrays (#5713)
* fix: preserve null_density for nested types in create_random_array
* fix: remove dbg statement
* fix: clarify comment
* fix: fmt
* fix: nullable struct arrays respect null_density
* test: support random MapArray
* fix: Create StructArray directly vs using TryFrom/From to avoid changing
DataTypes
---
arrow/src/util/data_gen.rs | 317 +++++++++++++++++++++++++++++++++++++++------
1 file changed, 278 insertions(+), 39 deletions(-)
diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs
index 36091f1c3ef..d563fa36240 100644
--- a/arrow/src/util/data_gen.rs
+++ b/arrow/src/util/data_gen.rs
@@ -59,44 +59,87 @@ pub fn create_random_array(
true_density: f32,
) -> Result<ArrayRef> {
// Override null density with 0.0 if the array is non-nullable
- let null_density = match field.is_nullable() {
+ // and a primitive type in case a nested field is nullable
+ let primitive_null_density = match field.is_nullable() {
true => null_density,
false => 0.0,
};
use DataType::*;
Ok(match field.data_type() {
Null => Arc::new(NullArray::new(size)) as ArrayRef,
- Boolean => Arc::new(create_boolean_array(size, null_density,
true_density)),
- Int8 => Arc::new(create_primitive_array::<Int8Type>(size,
null_density)),
- Int16 => Arc::new(create_primitive_array::<Int16Type>(size,
null_density)),
- Int32 => Arc::new(create_primitive_array::<Int32Type>(size,
null_density)),
- Int64 => Arc::new(create_primitive_array::<Int64Type>(size,
null_density)),
- UInt8 => Arc::new(create_primitive_array::<UInt8Type>(size,
null_density)),
- UInt16 => Arc::new(create_primitive_array::<UInt16Type>(size,
null_density)),
- UInt32 => Arc::new(create_primitive_array::<UInt32Type>(size,
null_density)),
- UInt64 => Arc::new(create_primitive_array::<UInt64Type>(size,
null_density)),
+ Boolean => Arc::new(create_boolean_array(
+ size,
+ primitive_null_density,
+ true_density,
+ )),
+ Int8 => Arc::new(create_primitive_array::<Int8Type>(
+ size,
+ primitive_null_density,
+ )),
+ Int16 => Arc::new(create_primitive_array::<Int16Type>(
+ size,
+ primitive_null_density,
+ )),
+ Int32 => Arc::new(create_primitive_array::<Int32Type>(
+ size,
+ primitive_null_density,
+ )),
+ Int64 => Arc::new(create_primitive_array::<Int64Type>(
+ size,
+ primitive_null_density,
+ )),
+ UInt8 => Arc::new(create_primitive_array::<UInt8Type>(
+ size,
+ primitive_null_density,
+ )),
+ UInt16 => Arc::new(create_primitive_array::<UInt16Type>(
+ size,
+ primitive_null_density,
+ )),
+ UInt32 => Arc::new(create_primitive_array::<UInt32Type>(
+ size,
+ primitive_null_density,
+ )),
+ UInt64 => Arc::new(create_primitive_array::<UInt64Type>(
+ size,
+ primitive_null_density,
+ )),
Float16 => {
return Err(ArrowError::NotYetImplemented(
"Float16 is not implemented".to_string(),
))
}
- Float32 => Arc::new(create_primitive_array::<Float32Type>(size,
null_density)),
- Float64 => Arc::new(create_primitive_array::<Float64Type>(size,
null_density)),
+ Float32 => Arc::new(create_primitive_array::<Float32Type>(
+ size,
+ primitive_null_density,
+ )),
+ Float64 => Arc::new(create_primitive_array::<Float64Type>(
+ size,
+ primitive_null_density,
+ )),
Timestamp(_, _) => {
- let int64_array =
- Arc::new(create_primitive_array::<Int64Type>(size,
null_density)) as ArrayRef;
+ let int64_array = Arc::new(create_primitive_array::<Int64Type>(
+ size,
+ primitive_null_density,
+ )) as ArrayRef;
return crate::compute::cast(&int64_array, field.data_type());
}
- Date32 => Arc::new(create_primitive_array::<Date32Type>(size,
null_density)),
- Date64 => Arc::new(create_primitive_array::<Date64Type>(size,
null_density)),
+ Date32 => Arc::new(create_primitive_array::<Date32Type>(
+ size,
+ primitive_null_density,
+ )),
+ Date64 => Arc::new(create_primitive_array::<Date64Type>(
+ size,
+ primitive_null_density,
+ )),
Time32(unit) => match unit {
TimeUnit::Second =>
Arc::new(create_primitive_array::<Time32SecondType>(
size,
- null_density,
+ primitive_null_density,
)) as ArrayRef,
TimeUnit::Millisecond =>
Arc::new(create_primitive_array::<Time32MillisecondType>(
size,
- null_density,
+ primitive_null_density,
)),
_ => {
return Err(ArrowError::InvalidArgumentError(format!(
@@ -107,11 +150,11 @@ pub fn create_random_array(
Time64(unit) => match unit {
TimeUnit::Microsecond =>
Arc::new(create_primitive_array::<Time64MicrosecondType>(
size,
- null_density,
+ primitive_null_density,
)) as ArrayRef,
TimeUnit::Nanosecond =>
Arc::new(create_primitive_array::<Time64NanosecondType>(
size,
- null_density,
+ primitive_null_density,
)),
_ => {
return Err(ArrowError::InvalidArgumentError(format!(
@@ -119,31 +162,28 @@ pub fn create_random_array(
)))
}
},
- Utf8 => Arc::new(create_string_array::<i32>(size, null_density)),
- LargeUtf8 => Arc::new(create_string_array::<i64>(size, null_density)),
+ Utf8 => Arc::new(create_string_array::<i32>(size,
primitive_null_density)),
+ LargeUtf8 => Arc::new(create_string_array::<i64>(size,
primitive_null_density)),
Utf8View => Arc::new(create_string_view_array_with_len(
size,
- null_density,
+ primitive_null_density,
4,
false,
)),
- Binary => Arc::new(create_binary_array::<i32>(size, null_density)),
- LargeBinary => Arc::new(create_binary_array::<i64>(size,
null_density)),
- FixedSizeBinary(len) => Arc::new(create_fsb_array(size, null_density,
*len as usize)),
+ Binary => Arc::new(create_binary_array::<i32>(size,
primitive_null_density)),
+ LargeBinary => Arc::new(create_binary_array::<i64>(size,
primitive_null_density)),
+ FixedSizeBinary(len) => Arc::new(create_fsb_array(
+ size,
+ primitive_null_density,
+ *len as usize,
+ )),
BinaryView => Arc::new(
- create_string_view_array_with_len(size, null_density, 4,
false).to_binary_view(),
+ create_string_view_array_with_len(size, primitive_null_density, 4,
false)
+ .to_binary_view(),
),
List(_) => create_random_list_array(field, size, null_density,
true_density)?,
LargeList(_) => create_random_list_array(field, size, null_density,
true_density)?,
- Struct(fields) => Arc::new(StructArray::try_from(
- fields
- .iter()
- .map(|struct_field| {
- create_random_array(struct_field, size, null_density,
true_density)
- .map(|array_ref| (struct_field.name().as_str(),
array_ref))
- })
- .collect::<Result<Vec<(&str, ArrayRef)>>>()?,
- )?),
+ Struct(_) => create_random_struct_array(field, size, null_density,
true_density)?,
d @ Dictionary(_, value_type) if
crate::compute::can_cast_types(value_type, d) => {
let f = Field::new(
field.name(),
@@ -153,6 +193,7 @@ pub fn create_random_array(
let v = create_random_array(&f, size, null_density, true_density)?;
crate::compute::cast(&v, d)?
}
+ Map(_, _) => create_random_map_array(field, size, null_density,
true_density)?,
other => {
return Err(ArrowError::NotYetImplemented(format!(
"Generating random arrays not yet implemented for {other:?}"
@@ -169,7 +210,7 @@ fn create_random_list_array(
true_density: f32,
) -> Result<ArrayRef> {
// Override null density with 0.0 if the array is non-nullable
- let null_density = match field.is_nullable() {
+ let list_null_density = match field.is_nullable() {
true => null_density,
false => 0.0,
};
@@ -197,7 +238,7 @@ fn create_random_list_array(
let child_data = child_array.to_data();
// Create list's null buffers, if it is nullable
let null_buffer = match field.is_nullable() {
- true => Some(create_random_null_buffer(size, null_density)),
+ true => Some(create_random_null_buffer(size, list_null_density)),
false => None,
};
let list_data = unsafe {
@@ -214,6 +255,98 @@ fn create_random_list_array(
Ok(make_array(list_data))
}
+#[inline]
+fn create_random_struct_array(
+ field: &Field,
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+) -> Result<ArrayRef> {
+ let struct_fields = match field.data_type() {
+ DataType::Struct(fields) => fields,
+ _ => {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Cannot create struct array for field {field:?}"
+ )))
+ }
+ };
+
+ let child_arrays = struct_fields
+ .iter()
+ .map(|struct_field| create_random_array(struct_field, size,
null_density, true_density))
+ .collect::<Result<Vec<_>>>()?;
+
+ let null_buffer = match field.is_nullable() {
+ true => {
+ let nulls = arrow_buffer::BooleanBuffer::new(
+ create_random_null_buffer(size, null_density),
+ 0,
+ size,
+ );
+ Some(nulls.into())
+ }
+ false => None,
+ };
+
+ Ok(Arc::new(StructArray::try_new(
+ struct_fields.clone(),
+ child_arrays,
+ null_buffer,
+ )?))
+}
+
+#[inline]
+fn create_random_map_array(
+ field: &Field,
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+) -> Result<ArrayRef> {
+ // Override null density with 0.0 if the array is non-nullable
+ let map_null_density = match field.is_nullable() {
+ true => null_density,
+ false => 0.0,
+ };
+
+ let entries_field = match field.data_type() {
+ DataType::Map(f, _) => f,
+ _ => {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Cannot create map array for field {field:?}"
+ )))
+ }
+ };
+
+ let (offsets, child_len) = create_random_offsets::<i32>(size, 0, 5);
+ let offsets = Buffer::from(offsets.to_byte_slice());
+
+ let entries = create_random_array(
+ entries_field,
+ child_len as usize,
+ null_density,
+ true_density,
+ )?
+ .to_data();
+
+ let null_buffer = match field.is_nullable() {
+ true => Some(create_random_null_buffer(size, map_null_density)),
+ false => None,
+ };
+
+ let map_data = unsafe {
+ ArrayData::new_unchecked(
+ field.data_type().clone(),
+ size,
+ None,
+ null_buffer,
+ 0,
+ vec![offsets],
+ vec![entries],
+ )
+ };
+ Ok(make_array(map_data))
+}
+
/// Generate random offsets for list arrays
fn create_random_offsets<T: OffsetSizeTrait + SampleUniform>(
size: usize,
@@ -275,7 +408,7 @@ mod tests {
Field::new("a", DataType::Int32, false),
Field::new(
"b",
- DataType::List(Arc::new(Field::new("item",
DataType::LargeUtf8, true))),
+ DataType::List(Arc::new(Field::new("item",
DataType::LargeUtf8, false))),
false,
),
Field::new("a", DataType::Int32, false),
@@ -352,4 +485,110 @@ mod tests {
assert_eq!(col_d_y.data_type(), &DataType::Float32);
assert_eq!(col_d_y.null_count(), 0);
}
+
+ #[test]
+ fn test_create_list_array_nested_nullability() {
+ let list_field = Field::new_list(
+ "not_null_list",
+ Field::new_list_field(DataType::Boolean, true),
+ false,
+ );
+
+ let list_array = create_random_array(&list_field, 100, 0.95,
0.5).unwrap();
+
+ assert_eq!(list_array.null_count(), 0);
+ assert!(list_array.as_list::<i32>().values().null_count() > 0);
+ }
+
+ #[test]
+ fn test_create_struct_array_nested_nullability() {
+ let struct_child_fields = vec![
+ Field::new("null_int", DataType::Int32, true),
+ Field::new("int", DataType::Int32, false),
+ ];
+ let struct_field = Field::new_struct("not_null_struct",
struct_child_fields, false);
+
+ let struct_array = create_random_array(&struct_field, 100, 0.95,
0.5).unwrap();
+
+ assert_eq!(struct_array.null_count(), 0);
+ assert!(
+ struct_array
+ .as_struct()
+ .column_by_name("null_int")
+ .unwrap()
+ .null_count()
+ > 0
+ );
+ assert_eq!(
+ struct_array
+ .as_struct()
+ .column_by_name("int")
+ .unwrap()
+ .null_count(),
+ 0
+ );
+ }
+
+ #[test]
+ fn test_create_list_array_nested_struct_nullability() {
+ let struct_child_fields = vec![
+ Field::new("null_int", DataType::Int32, true),
+ Field::new("int", DataType::Int32, false),
+ ];
+ let list_item_field =
+
Field::new_list_field(DataType::Struct(struct_child_fields.into()), true);
+ let list_field = Field::new_list("not_null_list", list_item_field,
false);
+
+ let list_array = create_random_array(&list_field, 100, 0.95,
0.5).unwrap();
+
+ assert_eq!(list_array.null_count(), 0);
+ assert!(list_array.as_list::<i32>().values().null_count() > 0);
+ assert!(
+ list_array
+ .as_list::<i32>()
+ .values()
+ .as_struct()
+ .column_by_name("null_int")
+ .unwrap()
+ .null_count()
+ > 0
+ );
+ assert_eq!(
+ list_array
+ .as_list::<i32>()
+ .values()
+ .as_struct()
+ .column_by_name("int")
+ .unwrap()
+ .null_count(),
+ 0
+ );
+ }
+
+ #[test]
+ fn test_create_map_array() {
+ let map_field = Field::new_map(
+ "map",
+ "entries",
+ Field::new("key", DataType::Utf8, false),
+ Field::new("value", DataType::Utf8, true),
+ false,
+ false,
+ );
+ let array = create_random_array(&map_field, 100, 0.8, 0.5).unwrap();
+
+ assert_eq!(array.len(), 100);
+ // Map field is not null
+ assert_eq!(array.null_count(), 0);
+ // Maps have multiple values like a list, so internal arrays are longer
+ assert!(array.as_map().keys().len() > array.len());
+ assert!(array.as_map().values().len() > array.len());
+ // Keys are not nullable
+ assert_eq!(array.as_map().keys().null_count(), 0);
+ // Values are nullable
+ assert!(array.as_map().values().null_count() > 0);
+
+ assert_eq!(array.as_map().keys().data_type(), &DataType::Utf8);
+ assert_eq!(array.as_map().values().data_type(), &DataType::Utf8);
+ }
}