This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 0d25340fb3 [Varint] Implement ShreddingState::AllNull variant (#8093)
0d25340fb3 is described below

commit 0d25340fb326f27f72dc883f709a8f43046d9280
Author: Yan Tingwang <[email protected]>
AuthorDate: Thu Aug 21 01:45:45 2025 +0800

    [Varint] Implement ShreddingState::AllNull variant (#8093)
    
    # Which issue does this PR close?
    
    - Closes #8088 .
    
    # What changes are included in this PR?
    
    handles the "all null" case
    
    # Are these changes tested?
    
    Yes.
    
    # Are there any user-facing changes?
    
    no.
    
    ---------
    
    Signed-off-by: codephage2020 <[email protected]>
    Co-authored-by: Ryan Johnson <[email protected]>
---
 parquet-variant-compute/src/variant_array.rs       | 134 +++++++++++++++++++--
 parquet-variant-compute/src/variant_get/mod.rs     |  73 +++++++++++
 .../src/variant_get/output/mod.rs                  |   7 ++
 .../src/variant_get/output/primitive.rs            |  16 ++-
 .../src/variant_get/output/variant.rs              |  13 ++
 5 files changed, 228 insertions(+), 15 deletions(-)

diff --git a/parquet-variant-compute/src/variant_array.rs 
b/parquet-variant-compute/src/variant_array.rs
index e715d0a6c0..c541258942 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -187,6 +187,13 @@ impl VariantArray {
                     typed_value_to_variant(typed_value, index)
                 }
             }
+            ShreddingState::AllNull { .. } => {
+                // NOTE: This handles the case where neither value nor 
typed_value fields exist.
+                // For top-level variants, this returns Variant::Null (JSON 
null).
+                // For shredded object fields, this technically should 
indicate SQL NULL,
+                // but the current API cannot distinguish these contexts.
+                Variant::Null
+            }
         }
     }
 
@@ -226,9 +233,6 @@ impl VariantArray {
 /// [Parquet Variant Shredding Spec]: 
https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
 #[derive(Debug)]
 pub enum ShreddingState {
-    // TODO: add missing state where there is neither value nor typed_value
-    // https://github.com/apache/arrow-rs/issues/8088
-    // Missing { metadata: BinaryViewArray },
     /// This variant has no typed_value field
     Unshredded {
         metadata: BinaryViewArray,
@@ -251,6 +255,13 @@ pub enum ShreddingState {
         value: BinaryViewArray,
         typed_value: ArrayRef,
     },
+    /// All values are null, only metadata is present.
+    ///
+    /// This state occurs when neither `value` nor `typed_value` fields exist 
in the schema.
+    /// Note: By strict spec interpretation, this should only be valid for 
shredded object fields,
+    /// not top-level variants. However, we allow it and treat as 
Variant::Null for pragmatic
+    /// handling of missing data.
+    AllNull { metadata: BinaryViewArray },
 }
 
 impl ShreddingState {
@@ -271,9 +282,7 @@ impl ShreddingState {
                 metadata,
                 typed_value,
             }),
-            (_metadata_field, None, None) => 
Err(ArrowError::InvalidArgumentError(String::from(
-                "VariantArray has neither value nor typed_value field",
-            ))),
+            (metadata, None, None) => Ok(Self::AllNull { metadata }),
         }
     }
 
@@ -283,6 +292,7 @@ impl ShreddingState {
             ShreddingState::Unshredded { metadata, .. } => metadata,
             ShreddingState::Typed { metadata, .. } => metadata,
             ShreddingState::PartiallyShredded { metadata, .. } => metadata,
+            ShreddingState::AllNull { metadata } => metadata,
         }
     }
 
@@ -292,6 +302,7 @@ impl ShreddingState {
             ShreddingState::Unshredded { value, .. } => Some(value),
             ShreddingState::Typed { .. } => None,
             ShreddingState::PartiallyShredded { value, .. } => Some(value),
+            ShreddingState::AllNull { .. } => None,
         }
     }
 
@@ -301,6 +312,7 @@ impl ShreddingState {
             ShreddingState::Unshredded { .. } => None,
             ShreddingState::Typed { typed_value, .. } => Some(typed_value),
             ShreddingState::PartiallyShredded { typed_value, .. } => 
Some(typed_value),
+            ShreddingState::AllNull { .. } => None,
         }
     }
 
@@ -327,6 +339,9 @@ impl ShreddingState {
                 value: value.slice(offset, length),
                 typed_value: typed_value.slice(offset, length),
             },
+            ShreddingState::AllNull { metadata } => ShreddingState::AllNull {
+                metadata: metadata.slice(offset, length),
+            },
         }
     }
 }
@@ -435,15 +450,27 @@ mod test {
     }
 
     #[test]
-    fn invalid_missing_value() {
+    fn all_null_missing_value_and_typed_value() {
         let fields = Fields::from(vec![Field::new("metadata", 
DataType::BinaryView, false)]);
         let array = StructArray::new(fields, vec![make_binary_view_array()], 
None);
-        // Should fail because the StructArray does not contain a 'value' field
-        let err = VariantArray::try_new(Arc::new(array));
-        assert_eq!(
-            err.unwrap_err().to_string(),
-            "Invalid argument error: VariantArray has neither value nor 
typed_value field"
-        );
+
+        // NOTE: By strict spec interpretation, this case (top-level variant 
with null/null)
+        // should be invalid, but we currently allow it and treat it as 
Variant::Null.
+        // This is a pragmatic decision to handle missing data gracefully.
+        let variant_array = VariantArray::try_new(Arc::new(array)).unwrap();
+
+        // Verify the shredding state is AllNull
+        assert!(matches!(
+            variant_array.shredding_state(),
+            ShreddingState::AllNull { .. }
+        ));
+
+        // Verify that value() returns Variant::Null (compensating for spec 
violation)
+        for i in 0..variant_array.len() {
+            if variant_array.is_valid(i) {
+                assert_eq!(variant_array.value(i), 
parquet_variant::Variant::Null);
+            }
+        }
     }
 
     #[test]
@@ -489,4 +516,85 @@ mod test {
     fn make_binary_array() -> ArrayRef {
         Arc::new(BinaryArray::from(vec![b"test" as &[u8]]))
     }
+
+    #[test]
+    fn all_null_shredding_state() {
+        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]);
+        let shredding_state = ShreddingState::try_new(metadata.clone(), None, 
None).unwrap();
+
+        assert!(matches!(shredding_state, ShreddingState::AllNull { .. }));
+
+        // Verify metadata is preserved correctly
+        if let ShreddingState::AllNull { metadata: m } = shredding_state {
+            assert_eq!(m.len(), metadata.len());
+            assert_eq!(m.value(0), metadata.value(0));
+        }
+    }
+
+    #[test]
+    fn all_null_variant_array_construction() {
+        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
+        let nulls = NullBuffer::from(vec![false, false, false]); // all null
+
+        let fields = Fields::from(vec![Field::new("metadata", 
DataType::BinaryView, false)]);
+        let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], 
Some(nulls));
+
+        let variant_array = 
VariantArray::try_new(Arc::new(struct_array)).unwrap();
+
+        // Verify the shredding state is AllNull
+        assert!(matches!(
+            variant_array.shredding_state(),
+            ShreddingState::AllNull { .. }
+        ));
+
+        // Verify all values are null
+        assert_eq!(variant_array.len(), 3);
+        assert!(!variant_array.is_valid(0));
+        assert!(!variant_array.is_valid(1));
+        assert!(!variant_array.is_valid(2));
+
+        // Verify that value() returns Variant::Null for all indices
+        for i in 0..variant_array.len() {
+            assert!(
+                !variant_array.is_valid(i),
+                "Expected value at index {i} to be null"
+            );
+        }
+    }
+
+    #[test]
+    fn value_field_present_but_all_null_should_be_unshredded() {
+        // This test demonstrates the issue: when a value field exists in 
schema
+        // but all its values are null, it should remain Unshredded, not 
AllNull
+        let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
+
+        // Create a value field with all null values
+        let value_nulls = NullBuffer::from(vec![false, false, false]); // all 
null
+        let value_array = BinaryViewArray::from_iter_values(vec![""; 3]);
+        let value_data = value_array
+            .to_data()
+            .into_builder()
+            .nulls(Some(value_nulls))
+            .build()
+            .unwrap();
+        let value = BinaryViewArray::from(value_data);
+
+        let fields = Fields::from(vec![
+            Field::new("metadata", DataType::BinaryView, false),
+            Field::new("value", DataType::BinaryView, true), // Field exists 
in schema
+        ]);
+        let struct_array = StructArray::new(
+            fields,
+            vec![Arc::new(metadata), Arc::new(value)],
+            None, // struct itself is not null, just the value field is all 
null
+        );
+
+        let variant_array = 
VariantArray::try_new(Arc::new(struct_array)).unwrap();
+
+        // This should be Unshredded, not AllNull, because value field exists 
in schema
+        assert!(matches!(
+            variant_array.shredding_state(),
+            ShreddingState::Unshredded { .. }
+        ));
+    }
 }
diff --git a/parquet-variant-compute/src/variant_get/mod.rs 
b/parquet-variant-compute/src/variant_get/mod.rs
index 6812a17483..0c9d2686c0 100644
--- a/parquet-variant-compute/src/variant_get/mod.rs
+++ b/parquet-variant-compute/src/variant_get/mod.rs
@@ -58,6 +58,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> 
Result<ArrayRef> {
         ShreddingState::Unshredded { metadata, value } => {
             output_builder.unshredded(variant_array, metadata, value)
         }
+        ShreddingState::AllNull { metadata } => 
output_builder.all_null(variant_array, metadata),
     }
 }
 
@@ -284,6 +285,40 @@ mod test {
         assert_eq!(&result, &expected)
     }
 
+    /// AllNull: extract a value as a VariantArray
+    #[test]
+    fn get_variant_all_null_as_variant() {
+        let array = all_null_variant_array();
+        let options = GetOptions::new();
+        let result = variant_get(&array, options).unwrap();
+
+        // expect the result is a VariantArray
+        let result: &VariantArray = result.as_any().downcast_ref().unwrap();
+        assert_eq!(result.len(), 3);
+
+        // All values should be null
+        assert!(!result.is_valid(0));
+        assert!(!result.is_valid(1));
+        assert!(!result.is_valid(2));
+    }
+
+    /// AllNull: extract a value as an Int32Array
+    #[test]
+    fn get_variant_all_null_as_int32() {
+        let array = all_null_variant_array();
+        // specify we want the typed value as Int32
+        let field = Field::new("typed_value", DataType::Int32, true);
+        let options = 
GetOptions::new().with_as_type(Some(FieldRef::from(field)));
+        let result = variant_get(&array, options).unwrap();
+
+        let expected: ArrayRef = Arc::new(Int32Array::from(vec![
+            Option::<i32>::None,
+            Option::<i32>::None,
+            Option::<i32>::None,
+        ]));
+        assert_eq!(&result, &expected)
+    }
+
     /// Return a VariantArray that represents a perfectly "shredded" variant
     /// for the following example (3 Variant::Int32 values):
     ///
@@ -427,4 +462,42 @@ mod test {
             StructArray::new(Fields::from(fields), arrays, nulls)
         }
     }
+
+    /// Return a VariantArray that represents an "all null" variant
+    /// for the following example (3 null values):
+    ///
+    /// ```text
+    /// null
+    /// null  
+    /// null
+    /// ```
+    ///
+    /// The schema of the corresponding `StructArray` would look like this:
+    ///
+    /// ```text
+    /// StructArray {
+    ///   metadata: BinaryViewArray,
+    /// }
+    /// ```
+    fn all_null_variant_array() -> ArrayRef {
+        let (metadata, _value) = { 
parquet_variant::VariantBuilder::new().finish() };
+
+        let nulls = NullBuffer::from(vec![
+            false, // row 0 is null
+            false, // row 1 is null
+            false, // row 2 is null
+        ]);
+
+        // metadata is the same for all rows (though they're all null)
+        let metadata = 
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3));
+
+        let struct_array = StructArrayBuilder::new()
+            .with_field("metadata", Arc::new(metadata))
+            .with_nulls(nulls)
+            .build();
+
+        Arc::new(
+            VariantArray::try_new(Arc::new(struct_array)).expect("should 
create variant array"),
+        )
+    }
 }
diff --git a/parquet-variant-compute/src/variant_get/output/mod.rs 
b/parquet-variant-compute/src/variant_get/output/mod.rs
index 245d73cce8..52a8f5bc02 100644
--- a/parquet-variant-compute/src/variant_get/output/mod.rs
+++ b/parquet-variant-compute/src/variant_get/output/mod.rs
@@ -58,6 +58,13 @@ pub(crate) trait OutputBuilder {
         metadata: &BinaryViewArray,
         value_field: &BinaryViewArray,
     ) -> Result<ArrayRef>;
+
+    /// write out an all-null variant array
+    fn all_null(
+        &self,
+        variant_array: &VariantArray,
+        metadata: &BinaryViewArray,
+    ) -> Result<ArrayRef>;
 }
 
 pub(crate) fn instantiate_output_builder<'a>(
diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs 
b/parquet-variant-compute/src/variant_get/output/primitive.rs
index 496d711c10..aabc9827a7 100644
--- a/parquet-variant-compute/src/variant_get/output/primitive.rs
+++ b/parquet-variant-compute/src/variant_get/output/primitive.rs
@@ -20,8 +20,8 @@ use crate::VariantArray;
 use arrow::error::Result;
 
 use arrow::array::{
-    Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, 
NullBufferBuilder,
-    PrimitiveArray,
+    new_null_array, Array, ArrayRef, ArrowPrimitiveType, AsArray, 
BinaryViewArray,
+    NullBufferBuilder, PrimitiveArray,
 };
 use arrow::compute::{cast_with_options, CastOptions};
 use arrow::datatypes::Int32Type;
@@ -157,6 +157,18 @@ impl<T: ArrowPrimitiveVariant> OutputBuilder for 
PrimitiveOutputBuilder<'_, T> {
             "variant_get unshredded to primitive types is not implemented yet",
         )))
     }
+
+    fn all_null(
+        &self,
+        variant_array: &VariantArray,
+        _metadata: &BinaryViewArray,
+    ) -> Result<ArrayRef> {
+        // For all-null case, create a primitive array with all null values
+        Ok(Arc::new(new_null_array(
+            self.as_type.data_type(),
+            variant_array.len(),
+        )))
+    }
 }
 
 impl ArrowPrimitiveVariant for Int32Type {
diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs 
b/parquet-variant-compute/src/variant_get/output/variant.rs
index 6f2f829b66..7c8b4da2f5 100644
--- a/parquet-variant-compute/src/variant_get/output/variant.rs
+++ b/parquet-variant-compute/src/variant_get/output/variant.rs
@@ -145,4 +145,17 @@ impl OutputBuilder for VariantOutputBuilder<'_> {
 
         Ok(Arc::new(builder.build()))
     }
+
+    fn all_null(
+        &self,
+        variant_array: &VariantArray,
+        _metadata: &BinaryViewArray,
+    ) -> arrow::error::Result<ArrayRef> {
+        // For all-null case, simply create a VariantArray with all null values
+        let mut builder = VariantArrayBuilder::new(variant_array.len());
+        for _i in 0..variant_array.len() {
+            builder.append_null();
+        }
+        Ok(Arc::new(builder.build()))
+    }
 }

Reply via email to