This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 481223f875 feat(parquet-variant): add Dictionary and REE variant_to_arrow support (#10014)
481223f875 is described below

commit 481223f8750957d47d34990fe1ae2a1a5aa0515a
Author: Neetika Mittal <[email protected]>
AuthorDate: Tue Jun 9 18:50:21 2026 +0100

    feat(parquet-variant): add Dictionary and REE variant_to_arrow support (#10014)
    
    # Which issue does this PR close?
    
    - Closes #10013
    - Related to #6736
    
    # Rationale for this change
    
    `variant_get` / `variant_to_arrow` can already convert Variant values
    into many native Arrow array layouts, but requesting
    `DataType::Dictionary` or `DataType::RunEndEncoded` was not supported.
    
    This PR adds support for those output encodings without changing Variant
    shredding semantics. `Dictionary` and `RunEndEncoded` are produced as
    Arrow result arrays only; they are not introduced as valid Parquet
    Variant shredded `typed_value` layouts.
    
    # What changes are included in this PR?
    
    1. Adds an encoded output builder in `variant_to_arrow` for
    `DataType::Dictionary` and `DataType::RunEndEncoded`.
    2. Builds the logical child value array using the existing
    Variant-to-Arrow builders, then delegates the final Dictionary/REE
    encoding to Arrow's existing cast kernels.
    3. Adds `variant_get` regression coverage for string dictionary, numeric
    dictionary, and run-end encoded outputs.
    
    # Are these changes tested?
    
    Yes:
    
    - `cargo fmt --check`
    - `cargo test -p parquet-variant-compute`
    - `cargo test -p parquet-variant`
    - `cargo clippy --workspace --all-targets`
    
    # Are there any user-facing changes?
    
    Yes. `variant_get` with `as_type` set to `DataType::Dictionary` or
    `DataType::RunEndEncoded` can now return those Arrow array encodings.
    
    Co-authored-by: Neetika Mittal <[email protected]>
---
 parquet-variant-compute/src/variant_get.rs      | 80 ++++++++++++++++++++++++-
 parquet-variant-compute/src/variant_to_arrow.rs | 63 ++++++++++++++++++-
 2 files changed, 141 insertions(+), 2 deletions(-)

diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs
index 38c577564d..c3e9159935 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -448,7 +448,7 @@ mod test {
         Time64NanosecondArray,
     };
     use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
-    use arrow::compute::CastOptions;
+    use arrow::compute::{CastOptions, cast};
     use arrow::datatypes::DataType::{Int16, Int32, Int64};
     use arrow::datatypes::i256;
     use arrow::util::display::FormatOptions;
@@ -4223,6 +4223,84 @@ mod test {
         }
     }
 
+    #[test]
+    fn get_variant_as_dictionary() {
+        let variant_array: ArrayRef = 
ArrayRef::from(VariantArray::from_iter(vec![
+            Some(Variant::from("apple")),
+            Some(Variant::from("banana")),
+            None,
+            Some(Variant::from("apple")),
+        ]));
+        let data_type = DataType::Dictionary(Box::new(DataType::Int32), 
Box::new(DataType::Utf8));
+        let options = 
GetOptions::new().with_as_type(Some(FieldRef::from(Field::new(
+            "dict",
+            data_type.clone(),
+            true,
+        ))));
+
+        let result = variant_get(&variant_array, options).unwrap();
+        assert_eq!(result.data_type(), &data_type);
+
+        let decoded = cast(result.as_ref(), &DataType::Utf8).unwrap();
+        let expected = StringArray::from(vec![Some("apple"), Some("banana"), 
None, Some("apple")]);
+        assert_eq!(decoded.as_ref(), &expected);
+    }
+
+    #[test]
+    fn get_variant_as_numeric_dictionary() {
+        let variant_array: ArrayRef = 
ArrayRef::from(VariantArray::from_iter(vec![
+            Some(Variant::from(42)),
+            Some(Variant::from(7)),
+            None,
+            Some(Variant::from(42)),
+        ]));
+        let data_type = DataType::Dictionary(Box::new(DataType::Int16), 
Box::new(DataType::Int32));
+        let options = 
GetOptions::new().with_as_type(Some(FieldRef::from(Field::new(
+            "dict",
+            data_type.clone(),
+            true,
+        ))));
+
+        let result = variant_get(&variant_array, options).unwrap();
+        assert_eq!(result.data_type(), &data_type);
+
+        let decoded = cast(result.as_ref(), &DataType::Int32).unwrap();
+        let expected = Int32Array::from(vec![Some(42), Some(7), None, 
Some(42)]);
+        assert_eq!(decoded.as_ref(), &expected);
+    }
+
+    #[test]
+    fn get_variant_as_run_end_encoded() {
+        let variant_array: ArrayRef = 
ArrayRef::from(VariantArray::from_iter(vec![
+            Some(Variant::from("apple")),
+            Some(Variant::from("apple")),
+            None,
+            Some(Variant::from("banana")),
+            Some(Variant::from("banana")),
+        ]));
+        let run_ends = Arc::new(Field::new("run_ends", DataType::Int32, 
false));
+        let values = Arc::new(Field::new("values", DataType::Utf8, true));
+        let data_type = DataType::RunEndEncoded(run_ends, values);
+        let options = 
GetOptions::new().with_as_type(Some(FieldRef::from(Field::new(
+            "ree",
+            data_type.clone(),
+            true,
+        ))));
+
+        let result = variant_get(&variant_array, options).unwrap();
+        assert_eq!(result.data_type(), &data_type);
+
+        let decoded = cast(result.as_ref(), &DataType::Utf8).unwrap();
+        let expected = StringArray::from(vec![
+            Some("apple"),
+            Some("apple"),
+            None,
+            Some("banana"),
+            Some("banana"),
+        ]);
+        assert_eq!(decoded.as_ref(), &expected);
+    }
+
     fn invalid_time_variant_array() -> ArrayRef {
         let mut builder = VariantArrayBuilder::new(3);
         // 86401000000 is invalid for Time64Microsecond (max is 86400000000)
diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs
index ee6f1049ed..9841da555d 100644
--- a/parquet-variant-compute/src/variant_to_arrow.rs
+++ b/parquet-variant-compute/src/variant_to_arrow.rs
@@ -33,7 +33,7 @@ use arrow::array::{
     StructArray,
 };
 use arrow::buffer::{OffsetBuffer, ScalarBuffer};
-use arrow::compute::{CastOptions, DecimalCast};
+use arrow::compute::{CastOptions, DecimalCast, cast_with_options};
 use arrow::datatypes::{self, DataType, DecimalType};
 use arrow::error::{ArrowError, Result};
 use arrow_schema::{FieldRef, Fields, TimeUnit};
@@ -48,6 +48,7 @@ pub(crate) enum VariantToArrowRowBuilder<'a> {
     Primitive(PrimitiveVariantToArrowRowBuilder<'a>),
     Array(ArrayVariantToArrowRowBuilder<'a>),
     Struct(StructVariantToArrowRowBuilder<'a>),
+    Encoded(EncodedVariantToArrowRowBuilder<'a>),
     BinaryVariant(VariantToBinaryVariantArrowRowBuilder),
 
     // Path extraction wrapper - contains a boxed enum for any of the above
@@ -61,6 +62,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
             Primitive(b) => b.append_null(),
             Array(b) => b.append_null(),
             Struct(b) => b.append_null(),
+            Encoded(b) => b.append_null(),
             BinaryVariant(b) => b.append_null(),
             WithPath(path_builder) => path_builder.append_null(),
         }
@@ -72,6 +74,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
             Primitive(b) => b.append_value(&value),
             Array(b) => b.append_value(&value),
             Struct(b) => b.append_value(&value),
+            Encoded(b) => b.append_value(value),
             BinaryVariant(b) => b.append_value(value),
             WithPath(path_builder) => path_builder.append_value(value),
         }
@@ -83,6 +86,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
             Primitive(b) => b.finish(),
             Array(b) => b.finish(),
             Struct(b) => b.finish(),
+            Encoded(b) => b.finish(),
             BinaryVariant(b) => b.finish(),
             WithPath(path_builder) => path_builder.finish(),
         }
@@ -110,6 +114,24 @@ fn make_typed_variant_to_arrow_row_builder<'a>(
                 ArrayVariantToArrowRowBuilder::try_new(data_type, 
cast_options, capacity, false)?;
             Ok(Array(builder))
         }
+        DataType::Dictionary(_, value_type) => {
+            let builder = EncodedVariantToArrowRowBuilder::try_new(
+                data_type,
+                value_type.as_ref(),
+                cast_options,
+                capacity,
+            )?;
+            Ok(Encoded(builder))
+        }
+        DataType::RunEndEncoded(_, value_field) => {
+            let builder = EncodedVariantToArrowRowBuilder::try_new(
+                data_type,
+                value_field.data_type(),
+                cast_options,
+                capacity,
+            )?;
+            Ok(Encoded(builder))
+        }
         data_type => {
             let builder =
                 make_primitive_variant_to_arrow_row_builder(data_type, 
cast_options, capacity)?;
@@ -331,6 +353,45 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
     }
 }
 
+pub(crate) struct EncodedVariantToArrowRowBuilder<'a> {
+    data_type: &'a DataType,
+    cast_options: &'a CastOptions<'a>,
+    values_builder: Box<VariantToArrowRowBuilder<'a>>,
+}
+
+impl<'a> EncodedVariantToArrowRowBuilder<'a> {
+    fn try_new(
+        data_type: &'a DataType,
+        value_type: &'a DataType,
+        cast_options: &'a CastOptions,
+        capacity: usize,
+    ) -> Result<Self> {
+        let values_builder = Box::new(make_typed_variant_to_arrow_row_builder(
+            value_type,
+            cast_options,
+            capacity,
+        )?);
+        Ok(Self {
+            data_type,
+            cast_options,
+            values_builder,
+        })
+    }
+
+    fn append_null(&mut self) -> Result<()> {
+        self.values_builder.append_null()
+    }
+
+    fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
+        self.values_builder.append_value(value)
+    }
+
+    fn finish(self) -> Result<ArrayRef> {
+        let values = self.values_builder.finish()?;
+        cast_with_options(values.as_ref(), self.data_type, self.cast_options)
+    }
+}
+
 /// Creates a row builder that converts primitive `Variant` values into the 
requested Arrow data type.
 pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
     data_type: &'a DataType,

Reply via email to