This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 1ab356f850 [Variant] Add variant to arrow for 
`DataType::{Binary/LargeBinary/BinaryView}` (#8768)
1ab356f850 is described below

commit 1ab356f850155469bf028f070bb42a2989d5daff
Author: Congxian Qiu <[email protected]>
AuthorDate: Sun Nov 9 20:58:49 2025 +0800

    [Variant] Add variant to arrow for 
`DataType::{Binary/LargeBinary/BinaryView}` (#8768)
    
    # Which issue does this PR close?
    
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax.
    
    - Closes #8767.
    
    
    # What changes are included in this PR?
    
    - Add a struct `VariantToBinaryArrowRowBuilder<'a, B>` and a trait
    `BinaryLikeArrayBuilder`
    - Add three enum variants `Binary(VariantToBinaryArrowRowBuilder<'a,
    BinaryBuilder>)`, `LargeBinary(VariantToBinaryArrowRowBuilder<'a,
    LargeBinaryBuilder>)` and `BinaryView(VariantToBinaryArrowRowBuilder<'a,
    BinaryViewBuilder>)` to `PrimitiveVariantToArrowRowBuilder`
    - Add tests to cover the added logic
    
    # Are these changes tested?
    
    Added new tests
    
    # Are there any user-facing changes?
    
    No existing public API changed; a new public trait `BinaryLikeArrayBuilder` is added to `arrow-array`
---
 arrow-array/src/builder/generic_bytes_builder.rs   | 46 ++++++++++++++-
 .../src/builder/generic_bytes_view_builder.rs      | 17 +++++-
 parquet-variant-compute/src/variant_array.rs       |  6 +-
 parquet-variant-compute/src/variant_get.rs         | 66 ++++++++++++++++++++--
 parquet-variant-compute/src/variant_to_arrow.rs    | 32 ++++++++++-
 5 files changed, 155 insertions(+), 12 deletions(-)

diff --git a/arrow-array/src/builder/generic_bytes_builder.rs 
b/arrow-array/src/builder/generic_bytes_builder.rs
index f743b31916..7ed4bc5826 100644
--- a/arrow-array/src/builder/generic_bytes_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_builder.rs
@@ -357,7 +357,7 @@ impl<O: OffsetSizeTrait> std::fmt::Write for 
GenericStringBuilder<O> {
 /// We will use the `AVERAGE_STRING_LENGTH` * row_count for `data_capacity`. \
 ///
 /// These capacities are preallocation hints used to improve performance,
-/// but consuquences of passing a hint too large or too small should be 
negligible.
+/// but consequences of passing a hint too large or too small should be 
negligible.
 const AVERAGE_STRING_LENGTH: usize = 16;
 /// Trait for string-like array builders
 ///
@@ -392,6 +392,50 @@ impl<O: OffsetSizeTrait> StringLikeArrayBuilder for 
GenericStringBuilder<O> {
     }
 }
 
+/// A byte size value representing the number of bytes to allocate per binary 
in [`GenericBinaryBuilder`]
+///
+/// To create a [`GenericBinaryBuilder`] using `.with_capacity` we are 
required to provide: \
+/// - `item_capacity` - the row count \
+/// - `data_capacity` - total binary byte count \
+///
+/// We will use the `AVERAGE_BINARY_LENGTH` * row_count for `data_capacity`. \
+///
+/// These capacities are preallocation hints used to improve performance,
+/// but consequences of passing a hint too large or too small should be 
negligible.
+const AVERAGE_BINARY_LENGTH: usize = 128;
+/// Trait for binary-like array builders
+///
+/// This trait provides a unified interface for builders that append binary-like 
data
+/// such as [`GenericBinaryBuilder<O>`] and 
[`crate::builder::BinaryViewBuilder`]
+pub trait BinaryLikeArrayBuilder: ArrayBuilder {
+    /// Returns a human-readable type name for the builder.
+    fn type_name() -> &'static str;
+
+    /// Creates a new builder with the given row capacity.
+    fn with_capacity(capacity: usize) -> Self;
+
+    /// Appends a non-null binary value to the builder.
+    fn append_value(&mut self, value: &[u8]);
+
+    /// Appends a null value to the builder.
+    fn append_null(&mut self);
+}
+
+impl<O: OffsetSizeTrait> BinaryLikeArrayBuilder for GenericBinaryBuilder<O> {
+    fn type_name() -> &'static str {
+        std::any::type_name::<Self>()
+    }
+    fn with_capacity(capacity: usize) -> Self {
+        Self::with_capacity(capacity, capacity * AVERAGE_BINARY_LENGTH)
+    }
+    fn append_value(&mut self, value: &[u8]) {
+        Self::append_value(self, value);
+    }
+    fn append_null(&mut self) {
+        Self::append_null(self);
+    }
+}
+
 ///  Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
 ///
 /// Values can be appended using [`GenericByteBuilder::append_value`], and 
nulls with
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs 
b/arrow-array/src/builder/generic_bytes_view_builder.rs
index 7e7a561a8c..35f5684cc7 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -25,7 +25,7 @@ use arrow_schema::ArrowError;
 use hashbrown::HashTable;
 use hashbrown::hash_table::Entry;
 
-use crate::builder::{ArrayBuilder, StringLikeArrayBuilder};
+use crate::builder::{ArrayBuilder, BinaryLikeArrayBuilder, 
StringLikeArrayBuilder};
 use crate::types::bytes::ByteArrayNativeType;
 use crate::types::{BinaryViewType, ByteViewType, StringViewType};
 use crate::{Array, ArrayRef, GenericByteViewArray};
@@ -570,6 +570,21 @@ impl StringLikeArrayBuilder for StringViewBuilder {
 ///
 pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>;
 
+impl BinaryLikeArrayBuilder for BinaryViewBuilder {
+    fn type_name() -> &'static str {
+        std::any::type_name::<BinaryViewBuilder>()
+    }
+    fn with_capacity(capacity: usize) -> Self {
+        Self::with_capacity(capacity)
+    }
+    fn append_value(&mut self, value: &[u8]) {
+        Self::append_value(self, value);
+    }
+    fn append_null(&mut self) {
+        Self::append_null(self);
+    }
+}
+
 /// Creates a view from a fixed length input (the compiler can generate
 /// specialized code for this)
 fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {
diff --git a/parquet-variant-compute/src/variant_array.rs 
b/parquet-variant-compute/src/variant_array.rs
index ba88c45bab..fb2a08d641 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -1172,9 +1172,9 @@ fn canonicalize_and_verify_data_type(data_type: 
&DataType) -> Result<Cow<'_, Dat
         Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
         Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
 
-        // Binary and string are allowed. Force Binary to BinaryView because 
that's what the parquet
+        // Binary and string are allowed. Force Binary/LargeBinary to 
BinaryView because that's what the parquet
         // reader returns and what the rest of the variant code expects.
-        Binary => Cow::Owned(DataType::BinaryView),
+        Binary | LargeBinary => Cow::Owned(BinaryView),
         BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
 
         // UUID maps to 16-byte fixed-size binary; no other width is allowed
@@ -1182,7 +1182,7 @@ fn canonicalize_and_verify_data_type(data_type: 
&DataType) -> Result<Cow<'_, Dat
         FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
 
         // We can _possibly_ allow (some of) these some day?
-        LargeBinary | ListView(_) | LargeList(_) | LargeListView(_) => {
+        ListView(_) | LargeList(_) | LargeListView(_) => {
             fail!()
         }
 
diff --git a/parquet-variant-compute/src/variant_get.rs 
b/parquet-variant-compute/src/variant_get.rs
index 59fdb6d31f..5edcb74a29 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -309,10 +309,11 @@ mod test {
     use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
     use crate::{VariantArray, VariantArrayBuilder, json_to_variant};
     use arrow::array::{
-        Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, 
Decimal32Array,
-        Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, 
Float64Array, Int8Array,
-        Int16Array, Int32Array, Int64Array, LargeStringArray, NullBuilder, 
StringArray,
-        StringViewArray, StructArray, Time64MicrosecondArray,
+        Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, 
Date32Array,
+        Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, 
Float32Array,
+        Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, 
LargeBinaryArray,
+        LargeStringArray, NullBuilder, StringArray, StringViewArray, 
StructArray,
+        Time64MicrosecondArray,
     };
     use arrow::buffer::NullBuffer;
     use arrow::compute::CastOptions;
@@ -1316,6 +1317,63 @@ mod test {
         )
     }
 
+    
perfectly_shredded_variant_array_fn!(perfectly_shredded_binary_variant_array, 
|| {
+        BinaryArray::from(vec![
+            Some(b"Apache" as &[u8]),
+            Some(b"Arrow-rs" as &[u8]),
+            Some(b"Parquet-variant" as &[u8]),
+        ])
+    });
+
+    perfectly_shredded_to_arrow_primitive_test!(
+        get_variant_perfectly_shredded_binary_as_binary,
+        DataType::Binary,
+        perfectly_shredded_binary_variant_array,
+        BinaryArray::from(vec![
+            Some(b"Apache" as &[u8]),
+            Some(b"Arrow-rs" as &[u8]),
+            Some(b"Parquet-variant" as &[u8]),
+        ])
+    );
+
+    
perfectly_shredded_variant_array_fn!(perfectly_shredded_large_binary_variant_array,
 || {
+        LargeBinaryArray::from(vec![
+            Some(b"Apache" as &[u8]),
+            Some(b"Arrow-rs" as &[u8]),
+            Some(b"Parquet-variant" as &[u8]),
+        ])
+    });
+
+    perfectly_shredded_to_arrow_primitive_test!(
+        get_variant_perfectly_shredded_large_binary_as_large_binary,
+        DataType::LargeBinary,
+        perfectly_shredded_large_binary_variant_array,
+        LargeBinaryArray::from(vec![
+            Some(b"Apache" as &[u8]),
+            Some(b"Arrow-rs" as &[u8]),
+            Some(b"Parquet-variant" as &[u8]),
+        ])
+    );
+
+    
perfectly_shredded_variant_array_fn!(perfectly_shredded_binary_view_variant_array,
 || {
+        BinaryViewArray::from(vec![
+            Some(b"Apache" as &[u8]),
+            Some(b"Arrow-rs" as &[u8]),
+            Some(b"Parquet-variant" as &[u8]),
+        ])
+    });
+
+    perfectly_shredded_to_arrow_primitive_test!(
+        get_variant_perfectly_shredded_binary_view_as_binary_view,
+        DataType::BinaryView,
+        perfectly_shredded_binary_view_variant_array,
+        BinaryViewArray::from(vec![
+            Some(b"Apache" as &[u8]),
+            Some(b"Arrow-rs" as &[u8]),
+            Some(b"Parquet-variant" as &[u8]),
+        ])
+    );
+
     /// Return a VariantArray that represents a normal "shredded" variant
     /// for the following example
     ///
diff --git a/parquet-variant-compute/src/variant_to_arrow.rs 
b/parquet-variant-compute/src/variant_to_arrow.rs
index b8030bc715..9dbf663566 100644
--- a/parquet-variant-compute/src/variant_to_arrow.rs
+++ b/parquet-variant-compute/src/variant_to_arrow.rs
@@ -16,9 +16,9 @@
 // under the License.
 
 use arrow::array::{
-    ArrayRef, BinaryViewArray, BooleanBuilder, FixedSizeBinaryBuilder, 
LargeStringBuilder,
-    NullArray, NullBufferBuilder, PrimitiveBuilder, StringBuilder, 
StringLikeArrayBuilder,
-    StringViewBuilder,
+    ArrayRef, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray, 
BinaryViewBuilder,
+    BooleanBuilder, FixedSizeBinaryBuilder, LargeBinaryBuilder, 
LargeStringBuilder, NullArray,
+    NullBufferBuilder, PrimitiveBuilder, StringBuilder, 
StringLikeArrayBuilder, StringViewBuilder,
 };
 use arrow::compute::{CastOptions, DecimalCast};
 use arrow::datatypes::{self, DataType, DecimalType};
@@ -66,6 +66,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
     String(VariantToStringArrowBuilder<'a, StringBuilder>),
     LargeString(VariantToStringArrowBuilder<'a, LargeStringBuilder>),
     StringView(VariantToStringArrowBuilder<'a, StringViewBuilder>),
+    Binary(VariantToBinaryArrowRowBuilder<'a, BinaryBuilder>),
+    LargeBinary(VariantToBinaryArrowRowBuilder<'a, LargeBinaryBuilder>),
+    BinaryView(VariantToBinaryArrowRowBuilder<'a, BinaryViewBuilder>),
 }
 
 /// Builder for converting variant values into strongly typed Arrow arrays.
@@ -111,6 +114,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             String(b) => b.append_null(),
             LargeString(b) => b.append_null(),
             StringView(b) => b.append_null(),
+            Binary(b) => b.append_null(),
+            LargeBinary(b) => b.append_null(),
+            BinaryView(b) => b.append_null(),
         }
     }
 
@@ -144,6 +150,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             String(b) => b.append_value(value),
             LargeString(b) => b.append_value(value),
             StringView(b) => b.append_value(value),
+            Binary(b) => b.append_value(value),
+            LargeBinary(b) => b.append_value(value),
+            BinaryView(b) => b.append_value(value),
         }
     }
 
@@ -177,6 +186,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             String(b) => b.finish(),
             LargeString(b) => b.finish(),
             StringView(b) => b.finish(),
+            Binary(b) => b.finish(),
+            LargeBinary(b) => b.finish(),
+            BinaryView(b) => b.finish(),
         }
     }
 }
@@ -322,6 +334,13 @@ pub(crate) fn 
make_primitive_variant_to_arrow_row_builder<'a>(
             LargeString(VariantToStringArrowBuilder::new(cast_options, 
capacity))
         }
         DataType::Utf8View => 
StringView(VariantToStringArrowBuilder::new(cast_options, capacity)),
+        DataType::Binary => 
Binary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)),
+        DataType::LargeBinary => {
+            LargeBinary(VariantToBinaryArrowRowBuilder::new(cast_options, 
capacity))
+        }
+        DataType::BinaryView => {
+            BinaryView(VariantToBinaryArrowRowBuilder::new(cast_options, 
capacity))
+        }
         _ if data_type.is_primitive() => {
             return Err(ArrowError::NotYetImplemented(format!(
                 "Primitive data_type {data_type:?} not yet implemented"
@@ -506,6 +525,13 @@ define_variant_to_primitive_builder!(
     type_name: T::DATA_TYPE
 );
 
+define_variant_to_primitive_builder!(
+    struct VariantToBinaryArrowRowBuilder<'a, B: BinaryLikeArrayBuilder>
+    |capacity| -> B { B::with_capacity(capacity) },
+    |value| value.as_u8_slice(),
+    type_name: B::type_name()
+);
+
 /// Builder for converting variant values to arrow Decimal values
 pub(crate) struct VariantToDecimalArrowRowBuilder<'a, T>
 where

Reply via email to