This is an automated email from the ASF dual-hosted git repository.

scovich pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new edd2c8eef5 support large string for unshred variant (#9515)
edd2c8eef5 is described below

commit edd2c8eef5a7b702947a25e3223539e3723d5aac
Author: Matthew Kim <[email protected]>
AuthorDate: Mon Mar 9 12:57:17 2026 -0400

    support large string for unshred variant (#9515)
    
    # Which issue does this PR close?
    
    - Closes https://github.com/apache/arrow-rs/issues/9513
    
    # Rationale for this change
    
    `VariantArray::try_new` and `canonicalize_and_verify_data_type` both
    accept `LargeUtf8` as a valid shredded variant type. However,
    `unshred_variant` currently only handles `Utf8` for string `typed_value`
    columns.
    
    This means a `VariantArray` with a `LargeUtf8` `typed_value` column can be
    constructed successfully, but calling `unshred_variant` on it fails.
---
 parquet-variant-compute/src/unshred_variant.rs | 44 ++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/parquet-variant-compute/src/unshred_variant.rs 
b/parquet-variant-compute/src/unshred_variant.rs
index 3600662915..0fba53b315 100644
--- a/parquet-variant-compute/src/unshred_variant.rs
+++ b/parquet-variant-compute/src/unshred_variant.rs
@@ -20,8 +20,8 @@
 use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder};
 use arrow::array::{
     Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, 
FixedSizeListArray,
-    GenericListArray, GenericListViewArray, ListLikeArray, PrimitiveArray, 
StringArray,
-    StructArray,
+    GenericListArray, GenericListViewArray, LargeStringArray, ListLikeArray, 
PrimitiveArray,
+    StringArray, StructArray,
 };
 use arrow::buffer::NullBuffer;
 use arrow::datatypes::{
@@ -105,6 +105,7 @@ enum UnshredVariantRowBuilder<'a> {
     TimestampNanosecond(TimestampUnshredRowBuilder<'a, 
TimestampNanosecondType>),
     PrimitiveBoolean(UnshredPrimitiveRowBuilder<'a, BooleanArray>),
     PrimitiveString(UnshredPrimitiveRowBuilder<'a, StringArray>),
+    PrimitiveLargeString(UnshredPrimitiveRowBuilder<'a, LargeStringArray>),
     PrimitiveBinaryView(UnshredPrimitiveRowBuilder<'a, BinaryViewArray>),
     PrimitiveUuid(UnshredPrimitiveRowBuilder<'a, FixedSizeBinaryArray>),
     List(ListUnshredVariantBuilder<'a, GenericListArray<i32>>),
@@ -146,6 +147,7 @@ impl<'a> UnshredVariantRowBuilder<'a> {
             Self::TimestampNanosecond(b) => b.append_row(builder, metadata, 
index),
             Self::PrimitiveBoolean(b) => b.append_row(builder, metadata, 
index),
             Self::PrimitiveString(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveLargeString(b) => b.append_row(builder, metadata, 
index),
             Self::PrimitiveBinaryView(b) => b.append_row(builder, metadata, 
index),
             Self::PrimitiveUuid(b) => b.append_row(builder, metadata, index),
             Self::List(b) => b.append_row(builder, metadata, index),
@@ -226,6 +228,7 @@ impl<'a> UnshredVariantRowBuilder<'a> {
             }
             DataType::Boolean => primitive_builder!(PrimitiveBoolean, 
as_boolean),
             DataType::Utf8 => primitive_builder!(PrimitiveString, as_string),
+            DataType::LargeUtf8 => primitive_builder!(PrimitiveLargeString, 
as_string),
             DataType::BinaryView => primitive_builder!(PrimitiveBinaryView, 
as_binary_view),
             DataType::FixedSizeBinary(16) => {
                 primitive_builder!(PrimitiveUuid, as_fixed_size_binary)
@@ -405,6 +408,7 @@ macro_rules! impl_append_to_variant_builder {
 
 impl_append_to_variant_builder!(BooleanArray);
 impl_append_to_variant_builder!(StringArray);
+impl_append_to_variant_builder!(LargeStringArray);
 impl_append_to_variant_builder!(BinaryViewArray);
 impl_append_to_variant_builder!(PrimitiveArray<Int8Type>);
 impl_append_to_variant_builder!(PrimitiveArray<Int16Type>);
@@ -666,3 +670,39 @@ impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, 
L> {
 
 // TODO: This code is covered by tests in 
`parquet/tests/variant_integration.rs`. Does that suffice?
 // Or do we also need targeted stand-alone unit tests for full coverage?
+
+#[cfg(test)]
+mod tests {
+    use crate::VariantArray;
+    use arrow::array::{BinaryViewArray, LargeStringArray};
+    use parquet_variant::Variant;
+
+    #[test]
+    fn test_unshred_largeutf8_typed_value() {
+        let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
+        let metadata =
+            BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]);
+
+        let typed_value: arrow::array::ArrayRef = std::sync::Arc::new(
+            LargeStringArray::from(vec![
+                Some("hello"),
+                Some("middle"),
+                Some("world"),
+            ]),
+        );
+
+        let variant_array = VariantArray::from_parts(
+            metadata,
+            None,
+            Some(typed_value),
+            None,
+        );
+
+        let result = crate::unshred_variant(&variant_array).unwrap();
+
+        assert_eq!(result.len(), 3);
+        assert_eq!(result.value(0), Variant::from("hello"));
+        assert_eq!(result.value(1), Variant::from("middle"));
+        assert_eq!(result.value(2), Variant::from("world"));
+    }
+}

Reply via email to