This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 158e54da2cd Fix nested nullability when randomly generating arrays 
(#5713)
158e54da2cd is described below

commit 158e54da2cd8a2ac4e54744ad8861aded0d43281
Author: Alex Wilcoxson <[email protected]>
AuthorDate: Sat May 11 04:57:44 2024 -0500

    Fix nested nullability when randomly generating arrays (#5713)
    
    * fix: preserve null_density for nested types in create_random_array
    
    * fix: remove dbg statement
    
    * fix: clarify comment
    
    * fix: fmt
    
    * fix: nullable struct arrays respect null_density
    
    * test: support random MapArray
    
    * fix: Create StructArray directly vs using TryFrom/From to avoid changing 
DataTypes
---
 arrow/src/util/data_gen.rs | 317 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 278 insertions(+), 39 deletions(-)

diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs
index 36091f1c3ef..d563fa36240 100644
--- a/arrow/src/util/data_gen.rs
+++ b/arrow/src/util/data_gen.rs
@@ -59,44 +59,87 @@ pub fn create_random_array(
     true_density: f32,
 ) -> Result<ArrayRef> {
     // Override null density with 0.0 if the array is non-nullable
-    let null_density = match field.is_nullable() {
+    // and a primitive type in case a nested field is nullable
+    let primitive_null_density = match field.is_nullable() {
         true => null_density,
         false => 0.0,
     };
     use DataType::*;
     Ok(match field.data_type() {
         Null => Arc::new(NullArray::new(size)) as ArrayRef,
-        Boolean => Arc::new(create_boolean_array(size, null_density, 
true_density)),
-        Int8 => Arc::new(create_primitive_array::<Int8Type>(size, 
null_density)),
-        Int16 => Arc::new(create_primitive_array::<Int16Type>(size, 
null_density)),
-        Int32 => Arc::new(create_primitive_array::<Int32Type>(size, 
null_density)),
-        Int64 => Arc::new(create_primitive_array::<Int64Type>(size, 
null_density)),
-        UInt8 => Arc::new(create_primitive_array::<UInt8Type>(size, 
null_density)),
-        UInt16 => Arc::new(create_primitive_array::<UInt16Type>(size, 
null_density)),
-        UInt32 => Arc::new(create_primitive_array::<UInt32Type>(size, 
null_density)),
-        UInt64 => Arc::new(create_primitive_array::<UInt64Type>(size, 
null_density)),
+        Boolean => Arc::new(create_boolean_array(
+            size,
+            primitive_null_density,
+            true_density,
+        )),
+        Int8 => Arc::new(create_primitive_array::<Int8Type>(
+            size,
+            primitive_null_density,
+        )),
+        Int16 => Arc::new(create_primitive_array::<Int16Type>(
+            size,
+            primitive_null_density,
+        )),
+        Int32 => Arc::new(create_primitive_array::<Int32Type>(
+            size,
+            primitive_null_density,
+        )),
+        Int64 => Arc::new(create_primitive_array::<Int64Type>(
+            size,
+            primitive_null_density,
+        )),
+        UInt8 => Arc::new(create_primitive_array::<UInt8Type>(
+            size,
+            primitive_null_density,
+        )),
+        UInt16 => Arc::new(create_primitive_array::<UInt16Type>(
+            size,
+            primitive_null_density,
+        )),
+        UInt32 => Arc::new(create_primitive_array::<UInt32Type>(
+            size,
+            primitive_null_density,
+        )),
+        UInt64 => Arc::new(create_primitive_array::<UInt64Type>(
+            size,
+            primitive_null_density,
+        )),
         Float16 => {
             return Err(ArrowError::NotYetImplemented(
                 "Float16 is not implemented".to_string(),
             ))
         }
-        Float32 => Arc::new(create_primitive_array::<Float32Type>(size, 
null_density)),
-        Float64 => Arc::new(create_primitive_array::<Float64Type>(size, 
null_density)),
+        Float32 => Arc::new(create_primitive_array::<Float32Type>(
+            size,
+            primitive_null_density,
+        )),
+        Float64 => Arc::new(create_primitive_array::<Float64Type>(
+            size,
+            primitive_null_density,
+        )),
         Timestamp(_, _) => {
-            let int64_array =
-                Arc::new(create_primitive_array::<Int64Type>(size, 
null_density)) as ArrayRef;
+            let int64_array = Arc::new(create_primitive_array::<Int64Type>(
+                size,
+                primitive_null_density,
+            )) as ArrayRef;
             return crate::compute::cast(&int64_array, field.data_type());
         }
-        Date32 => Arc::new(create_primitive_array::<Date32Type>(size, 
null_density)),
-        Date64 => Arc::new(create_primitive_array::<Date64Type>(size, 
null_density)),
+        Date32 => Arc::new(create_primitive_array::<Date32Type>(
+            size,
+            primitive_null_density,
+        )),
+        Date64 => Arc::new(create_primitive_array::<Date64Type>(
+            size,
+            primitive_null_density,
+        )),
         Time32(unit) => match unit {
             TimeUnit::Second => 
Arc::new(create_primitive_array::<Time32SecondType>(
                 size,
-                null_density,
+                primitive_null_density,
             )) as ArrayRef,
             TimeUnit::Millisecond => 
Arc::new(create_primitive_array::<Time32MillisecondType>(
                 size,
-                null_density,
+                primitive_null_density,
             )),
             _ => {
                 return Err(ArrowError::InvalidArgumentError(format!(
@@ -107,11 +150,11 @@ pub fn create_random_array(
         Time64(unit) => match unit {
             TimeUnit::Microsecond => 
Arc::new(create_primitive_array::<Time64MicrosecondType>(
                 size,
-                null_density,
+                primitive_null_density,
             )) as ArrayRef,
             TimeUnit::Nanosecond => 
Arc::new(create_primitive_array::<Time64NanosecondType>(
                 size,
-                null_density,
+                primitive_null_density,
             )),
             _ => {
                 return Err(ArrowError::InvalidArgumentError(format!(
@@ -119,31 +162,28 @@ pub fn create_random_array(
                 )))
             }
         },
-        Utf8 => Arc::new(create_string_array::<i32>(size, null_density)),
-        LargeUtf8 => Arc::new(create_string_array::<i64>(size, null_density)),
+        Utf8 => Arc::new(create_string_array::<i32>(size, 
primitive_null_density)),
+        LargeUtf8 => Arc::new(create_string_array::<i64>(size, 
primitive_null_density)),
         Utf8View => Arc::new(create_string_view_array_with_len(
             size,
-            null_density,
+            primitive_null_density,
             4,
             false,
         )),
-        Binary => Arc::new(create_binary_array::<i32>(size, null_density)),
-        LargeBinary => Arc::new(create_binary_array::<i64>(size, 
null_density)),
-        FixedSizeBinary(len) => Arc::new(create_fsb_array(size, null_density, 
*len as usize)),
+        Binary => Arc::new(create_binary_array::<i32>(size, 
primitive_null_density)),
+        LargeBinary => Arc::new(create_binary_array::<i64>(size, 
primitive_null_density)),
+        FixedSizeBinary(len) => Arc::new(create_fsb_array(
+            size,
+            primitive_null_density,
+            *len as usize,
+        )),
         BinaryView => Arc::new(
-            create_string_view_array_with_len(size, null_density, 4, 
false).to_binary_view(),
+            create_string_view_array_with_len(size, primitive_null_density, 4, 
false)
+                .to_binary_view(),
         ),
         List(_) => create_random_list_array(field, size, null_density, 
true_density)?,
         LargeList(_) => create_random_list_array(field, size, null_density, 
true_density)?,
-        Struct(fields) => Arc::new(StructArray::try_from(
-            fields
-                .iter()
-                .map(|struct_field| {
-                    create_random_array(struct_field, size, null_density, 
true_density)
-                        .map(|array_ref| (struct_field.name().as_str(), 
array_ref))
-                })
-                .collect::<Result<Vec<(&str, ArrayRef)>>>()?,
-        )?),
+        Struct(_) => create_random_struct_array(field, size, null_density, 
true_density)?,
         d @ Dictionary(_, value_type) if 
crate::compute::can_cast_types(value_type, d) => {
             let f = Field::new(
                 field.name(),
@@ -153,6 +193,7 @@ pub fn create_random_array(
             let v = create_random_array(&f, size, null_density, true_density)?;
             crate::compute::cast(&v, d)?
         }
+        Map(_, _) => create_random_map_array(field, size, null_density, 
true_density)?,
         other => {
             return Err(ArrowError::NotYetImplemented(format!(
                 "Generating random arrays not yet implemented for {other:?}"
@@ -169,7 +210,7 @@ fn create_random_list_array(
     true_density: f32,
 ) -> Result<ArrayRef> {
     // Override null density with 0.0 if the array is non-nullable
-    let null_density = match field.is_nullable() {
+    let list_null_density = match field.is_nullable() {
         true => null_density,
         false => 0.0,
     };
@@ -197,7 +238,7 @@ fn create_random_list_array(
     let child_data = child_array.to_data();
     // Create list's null buffers, if it is nullable
     let null_buffer = match field.is_nullable() {
-        true => Some(create_random_null_buffer(size, null_density)),
+        true => Some(create_random_null_buffer(size, list_null_density)),
         false => None,
     };
     let list_data = unsafe {
@@ -214,6 +255,98 @@ fn create_random_list_array(
     Ok(make_array(list_data))
 }
 
+#[inline]
+fn create_random_struct_array(
+    field: &Field,
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<ArrayRef> {
+    let struct_fields = match field.data_type() {
+        DataType::Struct(fields) => fields,
+        _ => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Cannot create struct array for field {field:?}"
+            )))
+        }
+    };
+
+    let child_arrays = struct_fields
+        .iter()
+        .map(|struct_field| create_random_array(struct_field, size, 
null_density, true_density))
+        .collect::<Result<Vec<_>>>()?;
+
+    let null_buffer = match field.is_nullable() {
+        true => {
+            let nulls = arrow_buffer::BooleanBuffer::new(
+                create_random_null_buffer(size, null_density),
+                0,
+                size,
+            );
+            Some(nulls.into())
+        }
+        false => None,
+    };
+
+    Ok(Arc::new(StructArray::try_new(
+        struct_fields.clone(),
+        child_arrays,
+        null_buffer,
+    )?))
+}
+
+#[inline]
+fn create_random_map_array(
+    field: &Field,
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<ArrayRef> {
+    // Override null density with 0.0 if the array is non-nullable
+    let map_null_density = match field.is_nullable() {
+        true => null_density,
+        false => 0.0,
+    };
+
+    let entries_field = match field.data_type() {
+        DataType::Map(f, _) => f,
+        _ => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Cannot create map array for field {field:?}"
+            )))
+        }
+    };
+
+    let (offsets, child_len) = create_random_offsets::<i32>(size, 0, 5);
+    let offsets = Buffer::from(offsets.to_byte_slice());
+
+    let entries = create_random_array(
+        entries_field,
+        child_len as usize,
+        null_density,
+        true_density,
+    )?
+    .to_data();
+
+    let null_buffer = match field.is_nullable() {
+        true => Some(create_random_null_buffer(size, map_null_density)),
+        false => None,
+    };
+
+    let map_data = unsafe {
+        ArrayData::new_unchecked(
+            field.data_type().clone(),
+            size,
+            None,
+            null_buffer,
+            0,
+            vec![offsets],
+            vec![entries],
+        )
+    };
+    Ok(make_array(map_data))
+}
+
 /// Generate random offsets for list arrays
 fn create_random_offsets<T: OffsetSizeTrait + SampleUniform>(
     size: usize,
@@ -275,7 +408,7 @@ mod tests {
             Field::new("a", DataType::Int32, false),
             Field::new(
                 "b",
-                DataType::List(Arc::new(Field::new("item", 
DataType::LargeUtf8, true))),
+                DataType::List(Arc::new(Field::new("item", 
DataType::LargeUtf8, false))),
                 false,
             ),
             Field::new("a", DataType::Int32, false),
@@ -352,4 +485,110 @@ mod tests {
         assert_eq!(col_d_y.data_type(), &DataType::Float32);
         assert_eq!(col_d_y.null_count(), 0);
     }
+
+    #[test]
+    fn test_create_list_array_nested_nullability() {
+        let list_field = Field::new_list(
+            "not_null_list",
+            Field::new_list_field(DataType::Boolean, true),
+            false,
+        );
+
+        let list_array = create_random_array(&list_field, 100, 0.95, 
0.5).unwrap();
+
+        assert_eq!(list_array.null_count(), 0);
+        assert!(list_array.as_list::<i32>().values().null_count() > 0);
+    }
+
+    #[test]
+    fn test_create_struct_array_nested_nullability() {
+        let struct_child_fields = vec![
+            Field::new("null_int", DataType::Int32, true),
+            Field::new("int", DataType::Int32, false),
+        ];
+        let struct_field = Field::new_struct("not_null_struct", 
struct_child_fields, false);
+
+        let struct_array = create_random_array(&struct_field, 100, 0.95, 
0.5).unwrap();
+
+        assert_eq!(struct_array.null_count(), 0);
+        assert!(
+            struct_array
+                .as_struct()
+                .column_by_name("null_int")
+                .unwrap()
+                .null_count()
+                > 0
+        );
+        assert_eq!(
+            struct_array
+                .as_struct()
+                .column_by_name("int")
+                .unwrap()
+                .null_count(),
+            0
+        );
+    }
+
+    #[test]
+    fn test_create_list_array_nested_struct_nullability() {
+        let struct_child_fields = vec![
+            Field::new("null_int", DataType::Int32, true),
+            Field::new("int", DataType::Int32, false),
+        ];
+        let list_item_field =
+            
Field::new_list_field(DataType::Struct(struct_child_fields.into()), true);
+        let list_field = Field::new_list("not_null_list", list_item_field, 
false);
+
+        let list_array = create_random_array(&list_field, 100, 0.95, 
0.5).unwrap();
+
+        assert_eq!(list_array.null_count(), 0);
+        assert!(list_array.as_list::<i32>().values().null_count() > 0);
+        assert!(
+            list_array
+                .as_list::<i32>()
+                .values()
+                .as_struct()
+                .column_by_name("null_int")
+                .unwrap()
+                .null_count()
+                > 0
+        );
+        assert_eq!(
+            list_array
+                .as_list::<i32>()
+                .values()
+                .as_struct()
+                .column_by_name("int")
+                .unwrap()
+                .null_count(),
+            0
+        );
+    }
+
+    #[test]
+    fn test_create_map_array() {
+        let map_field = Field::new_map(
+            "map",
+            "entries",
+            Field::new("key", DataType::Utf8, false),
+            Field::new("value", DataType::Utf8, true),
+            false,
+            false,
+        );
+        let array = create_random_array(&map_field, 100, 0.8, 0.5).unwrap();
+
+        assert_eq!(array.len(), 100);
+        // Map field is not null
+        assert_eq!(array.null_count(), 0);
+        // Maps have multiple values like a list, so internal arrays are longer
+        assert!(array.as_map().keys().len() > array.len());
+        assert!(array.as_map().values().len() > array.len());
+        // Keys are not nullable
+        assert_eq!(array.as_map().keys().null_count(), 0);
+        // Values are nullable
+        assert!(array.as_map().values().null_count() > 0);
+
+        assert_eq!(array.as_map().keys().data_type(), &DataType::Utf8);
+        assert_eq!(array.as_map().values().data_type(), &DataType::Utf8);
+    }
 }

Reply via email to