This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 0d25340fb3 [Varint] Implement ShreddingState::AllNull variant (#8093)
0d25340fb3 is described below
commit 0d25340fb326f27f72dc883f709a8f43046d9280
Author: Yan Tingwang <[email protected]>
AuthorDate: Thu Aug 21 01:45:45 2025 +0800
[Varint] Implement ShreddingState::AllNull variant (#8093)
# Which issue does this PR close?
- Closes #8088 .
# What changes are included in this PR?
handles the "all null" case
# Are these changes tested?
Yes.
# Are there any user-facing changes?
no.
---------
Signed-off-by: codephage2020 <[email protected]>
Co-authored-by: Ryan Johnson <[email protected]>
---
parquet-variant-compute/src/variant_array.rs | 134 +++++++++++++++++++--
parquet-variant-compute/src/variant_get/mod.rs | 73 +++++++++++
.../src/variant_get/output/mod.rs | 7 ++
.../src/variant_get/output/primitive.rs | 16 ++-
.../src/variant_get/output/variant.rs | 13 ++
5 files changed, 228 insertions(+), 15 deletions(-)
diff --git a/parquet-variant-compute/src/variant_array.rs
b/parquet-variant-compute/src/variant_array.rs
index e715d0a6c0..c541258942 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -187,6 +187,13 @@ impl VariantArray {
typed_value_to_variant(typed_value, index)
}
}
+ ShreddingState::AllNull { .. } => {
+ // NOTE: This handles the case where neither value nor
typed_value fields exist.
+ // For top-level variants, this returns Variant::Null (JSON
null).
+ // For shredded object fields, this technically should
indicate SQL NULL,
+ // but the current API cannot distinguish these contexts.
+ Variant::Null
+ }
}
}
@@ -226,9 +233,6 @@ impl VariantArray {
/// [Parquet Variant Shredding Spec]:
https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
#[derive(Debug)]
pub enum ShreddingState {
- // TODO: add missing state where there is neither value nor typed_value
- // https://github.com/apache/arrow-rs/issues/8088
- // Missing { metadata: BinaryViewArray },
/// This variant has no typed_value field
Unshredded {
metadata: BinaryViewArray,
@@ -251,6 +255,13 @@ pub enum ShreddingState {
value: BinaryViewArray,
typed_value: ArrayRef,
},
+ /// All values are null, only metadata is present.
+ ///
+ /// This state occurs when neither `value` nor `typed_value` fields exist
in the schema.
+ /// Note: By strict spec interpretation, this should only be valid for
shredded object fields,
+ /// not top-level variants. However, we allow it and treat as
Variant::Null for pragmatic
+ /// handling of missing data.
+ AllNull { metadata: BinaryViewArray },
}
impl ShreddingState {
@@ -271,9 +282,7 @@ impl ShreddingState {
metadata,
typed_value,
}),
- (_metadata_field, None, None) =>
Err(ArrowError::InvalidArgumentError(String::from(
- "VariantArray has neither value nor typed_value field",
- ))),
+ (metadata, None, None) => Ok(Self::AllNull { metadata }),
}
}
@@ -283,6 +292,7 @@ impl ShreddingState {
ShreddingState::Unshredded { metadata, .. } => metadata,
ShreddingState::Typed { metadata, .. } => metadata,
ShreddingState::PartiallyShredded { metadata, .. } => metadata,
+ ShreddingState::AllNull { metadata } => metadata,
}
}
@@ -292,6 +302,7 @@ impl ShreddingState {
ShreddingState::Unshredded { value, .. } => Some(value),
ShreddingState::Typed { .. } => None,
ShreddingState::PartiallyShredded { value, .. } => Some(value),
+ ShreddingState::AllNull { .. } => None,
}
}
@@ -301,6 +312,7 @@ impl ShreddingState {
ShreddingState::Unshredded { .. } => None,
ShreddingState::Typed { typed_value, .. } => Some(typed_value),
ShreddingState::PartiallyShredded { typed_value, .. } =>
Some(typed_value),
+ ShreddingState::AllNull { .. } => None,
}
}
@@ -327,6 +339,9 @@ impl ShreddingState {
value: value.slice(offset, length),
typed_value: typed_value.slice(offset, length),
},
+ ShreddingState::AllNull { metadata } => ShreddingState::AllNull {
+ metadata: metadata.slice(offset, length),
+ },
}
}
}
@@ -435,15 +450,27 @@ mod test {
}
#[test]
- fn invalid_missing_value() {
+ fn all_null_missing_value_and_typed_value() {
let fields = Fields::from(vec![Field::new("metadata",
DataType::BinaryView, false)]);
let array = StructArray::new(fields, vec![make_binary_view_array()],
None);
- // Should fail because the StructArray does not contain a 'value' field
- let err = VariantArray::try_new(Arc::new(array));
- assert_eq!(
- err.unwrap_err().to_string(),
- "Invalid argument error: VariantArray has neither value nor
typed_value field"
- );
+
+ // NOTE: By strict spec interpretation, this case (top-level variant
with null/null)
+ // should be invalid, but we currently allow it and treat it as
Variant::Null.
+ // This is a pragmatic decision to handle missing data gracefully.
+ let variant_array = VariantArray::try_new(Arc::new(array)).unwrap();
+
+ // Verify the shredding state is AllNull
+ assert!(matches!(
+ variant_array.shredding_state(),
+ ShreddingState::AllNull { .. }
+ ));
+
+ // Verify that value() returns Variant::Null (compensating for spec
violation)
+ for i in 0..variant_array.len() {
+ if variant_array.is_valid(i) {
+ assert_eq!(variant_array.value(i),
parquet_variant::Variant::Null);
+ }
+ }
}
#[test]
@@ -489,4 +516,85 @@ mod test {
fn make_binary_array() -> ArrayRef {
Arc::new(BinaryArray::from(vec![b"test" as &[u8]]))
}
+
+ #[test]
+ fn all_null_shredding_state() {
+ let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]);
+ let shredding_state = ShreddingState::try_new(metadata.clone(), None,
None).unwrap();
+
+ assert!(matches!(shredding_state, ShreddingState::AllNull { .. }));
+
+ // Verify metadata is preserved correctly
+ if let ShreddingState::AllNull { metadata: m } = shredding_state {
+ assert_eq!(m.len(), metadata.len());
+ assert_eq!(m.value(0), metadata.value(0));
+ }
+ }
+
+ #[test]
+ fn all_null_variant_array_construction() {
+ let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
+ let nulls = NullBuffer::from(vec![false, false, false]); // all null
+
+ let fields = Fields::from(vec![Field::new("metadata",
DataType::BinaryView, false)]);
+ let struct_array = StructArray::new(fields, vec![Arc::new(metadata)],
Some(nulls));
+
+ let variant_array =
VariantArray::try_new(Arc::new(struct_array)).unwrap();
+
+ // Verify the shredding state is AllNull
+ assert!(matches!(
+ variant_array.shredding_state(),
+ ShreddingState::AllNull { .. }
+ ));
+
+ // Verify all values are null
+ assert_eq!(variant_array.len(), 3);
+ assert!(!variant_array.is_valid(0));
+ assert!(!variant_array.is_valid(1));
+ assert!(!variant_array.is_valid(2));
+
+ // Verify that value() returns Variant::Null for all indices
+ for i in 0..variant_array.len() {
+ assert!(
+ !variant_array.is_valid(i),
+ "Expected value at index {i} to be null"
+ );
+ }
+ }
+
+ #[test]
+ fn value_field_present_but_all_null_should_be_unshredded() {
+ // This test demonstrates the issue: when a value field exists in
schema
+ // but all its values are null, it should remain Unshredded, not
AllNull
+ let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
+
+ // Create a value field with all null values
+ let value_nulls = NullBuffer::from(vec![false, false, false]); // all
null
+ let value_array = BinaryViewArray::from_iter_values(vec![""; 3]);
+ let value_data = value_array
+ .to_data()
+ .into_builder()
+ .nulls(Some(value_nulls))
+ .build()
+ .unwrap();
+ let value = BinaryViewArray::from(value_data);
+
+ let fields = Fields::from(vec![
+ Field::new("metadata", DataType::BinaryView, false),
+ Field::new("value", DataType::BinaryView, true), // Field exists
in schema
+ ]);
+ let struct_array = StructArray::new(
+ fields,
+ vec![Arc::new(metadata), Arc::new(value)],
+ None, // struct itself is not null, just the value field is all
null
+ );
+
+ let variant_array =
VariantArray::try_new(Arc::new(struct_array)).unwrap();
+
+ // This should be Unshredded, not AllNull, because value field exists
in schema
+ assert!(matches!(
+ variant_array.shredding_state(),
+ ShreddingState::Unshredded { .. }
+ ));
+ }
}
diff --git a/parquet-variant-compute/src/variant_get/mod.rs
b/parquet-variant-compute/src/variant_get/mod.rs
index 6812a17483..0c9d2686c0 100644
--- a/parquet-variant-compute/src/variant_get/mod.rs
+++ b/parquet-variant-compute/src/variant_get/mod.rs
@@ -58,6 +58,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) ->
Result<ArrayRef> {
ShreddingState::Unshredded { metadata, value } => {
output_builder.unshredded(variant_array, metadata, value)
}
+ ShreddingState::AllNull { metadata } =>
output_builder.all_null(variant_array, metadata),
}
}
@@ -284,6 +285,40 @@ mod test {
assert_eq!(&result, &expected)
}
+ /// AllNull: extract a value as a VariantArray
+ #[test]
+ fn get_variant_all_null_as_variant() {
+ let array = all_null_variant_array();
+ let options = GetOptions::new();
+ let result = variant_get(&array, options).unwrap();
+
+ // expect the result is a VariantArray
+ let result: &VariantArray = result.as_any().downcast_ref().unwrap();
+ assert_eq!(result.len(), 3);
+
+ // All values should be null
+ assert!(!result.is_valid(0));
+ assert!(!result.is_valid(1));
+ assert!(!result.is_valid(2));
+ }
+
+ /// AllNull: extract a value as an Int32Array
+ #[test]
+ fn get_variant_all_null_as_int32() {
+ let array = all_null_variant_array();
+ // specify we want the typed value as Int32
+ let field = Field::new("typed_value", DataType::Int32, true);
+ let options =
GetOptions::new().with_as_type(Some(FieldRef::from(field)));
+ let result = variant_get(&array, options).unwrap();
+
+ let expected: ArrayRef = Arc::new(Int32Array::from(vec![
+ Option::<i32>::None,
+ Option::<i32>::None,
+ Option::<i32>::None,
+ ]));
+ assert_eq!(&result, &expected)
+ }
+
/// Return a VariantArray that represents a perfectly "shredded" variant
/// for the following example (3 Variant::Int32 values):
///
@@ -427,4 +462,42 @@ mod test {
StructArray::new(Fields::from(fields), arrays, nulls)
}
}
+
+ /// Return a VariantArray that represents an "all null" variant
+ /// for the following example (3 null values):
+ ///
+ /// ```text
+ /// null
+ /// null
+ /// null
+ /// ```
+ ///
+ /// The schema of the corresponding `StructArray` would look like this:
+ ///
+ /// ```text
+ /// StructArray {
+ /// metadata: BinaryViewArray,
+ /// }
+ /// ```
+ fn all_null_variant_array() -> ArrayRef {
+ let (metadata, _value) = {
parquet_variant::VariantBuilder::new().finish() };
+
+ let nulls = NullBuffer::from(vec![
+ false, // row 0 is null
+ false, // row 1 is null
+ false, // row 2 is null
+ ]);
+
+ // metadata is the same for all rows (though they're all null)
+ let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3));
+
+ let struct_array = StructArrayBuilder::new()
+ .with_field("metadata", Arc::new(metadata))
+ .with_nulls(nulls)
+ .build();
+
+ Arc::new(
+ VariantArray::try_new(Arc::new(struct_array)).expect("should
create variant array"),
+ )
+ }
}
diff --git a/parquet-variant-compute/src/variant_get/output/mod.rs
b/parquet-variant-compute/src/variant_get/output/mod.rs
index 245d73cce8..52a8f5bc02 100644
--- a/parquet-variant-compute/src/variant_get/output/mod.rs
+++ b/parquet-variant-compute/src/variant_get/output/mod.rs
@@ -58,6 +58,13 @@ pub(crate) trait OutputBuilder {
metadata: &BinaryViewArray,
value_field: &BinaryViewArray,
) -> Result<ArrayRef>;
+
+ /// write out an all-null variant array
+ fn all_null(
+ &self,
+ variant_array: &VariantArray,
+ metadata: &BinaryViewArray,
+ ) -> Result<ArrayRef>;
}
pub(crate) fn instantiate_output_builder<'a>(
diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs
b/parquet-variant-compute/src/variant_get/output/primitive.rs
index 496d711c10..aabc9827a7 100644
--- a/parquet-variant-compute/src/variant_get/output/primitive.rs
+++ b/parquet-variant-compute/src/variant_get/output/primitive.rs
@@ -20,8 +20,8 @@ use crate::VariantArray;
use arrow::error::Result;
use arrow::array::{
- Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray,
NullBufferBuilder,
- PrimitiveArray,
+ new_null_array, Array, ArrayRef, ArrowPrimitiveType, AsArray,
BinaryViewArray,
+ NullBufferBuilder, PrimitiveArray,
};
use arrow::compute::{cast_with_options, CastOptions};
use arrow::datatypes::Int32Type;
@@ -157,6 +157,18 @@ impl<T: ArrowPrimitiveVariant> OutputBuilder for
PrimitiveOutputBuilder<'_, T> {
"variant_get unshredded to primitive types is not implemented yet",
)))
}
+
+ fn all_null(
+ &self,
+ variant_array: &VariantArray,
+ _metadata: &BinaryViewArray,
+ ) -> Result<ArrayRef> {
+ // For all-null case, create a primitive array with all null values
+ Ok(Arc::new(new_null_array(
+ self.as_type.data_type(),
+ variant_array.len(),
+ )))
+ }
}
impl ArrowPrimitiveVariant for Int32Type {
diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs
b/parquet-variant-compute/src/variant_get/output/variant.rs
index 6f2f829b66..7c8b4da2f5 100644
--- a/parquet-variant-compute/src/variant_get/output/variant.rs
+++ b/parquet-variant-compute/src/variant_get/output/variant.rs
@@ -145,4 +145,17 @@ impl OutputBuilder for VariantOutputBuilder<'_> {
Ok(Arc::new(builder.build()))
}
+
+ fn all_null(
+ &self,
+ variant_array: &VariantArray,
+ _metadata: &BinaryViewArray,
+ ) -> arrow::error::Result<ArrayRef> {
+ // For all-null case, simply create a VariantArray with all null values
+ let mut builder = VariantArrayBuilder::new(variant_array.len());
+ for _i in 0..variant_array.len() {
+ builder.append_null();
+ }
+ Ok(Arc::new(builder.build()))
+ }
}