This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new ebb6ede98b [Variant]: Implement `DataType::RunEndEncoded` support for
`cast_to_variant` kernel (#8174)
ebb6ede98b is described below
commit ebb6ede98b2b4d96a1a4f501a28ab42a3b937f73
Author: Liam Bao <[email protected]>
AuthorDate: Wed Aug 20 15:09:13 2025 -0400
[Variant]: Implement `DataType::RunEndEncoded` support for
`cast_to_variant` kernel (#8174)
# Which issue does this PR close?
- Closes #8064.
# Rationale for this change
# What changes are included in this PR?
Implement `DataType::RunEndEncoded` for `cast_to_variant`
# Are these changes tested?
Yes
# Are there any user-facing changes?
New cast type supported
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet-variant-compute/src/cast_to_variant.rs | 113 +++++++++++++++++++++++--
1 file changed, 106 insertions(+), 7 deletions(-)
diff --git a/parquet-variant-compute/src/cast_to_variant.rs
b/parquet-variant-compute/src/cast_to_variant.rs
index cdafb64b32..43ee8ccb39 100644
--- a/parquet-variant-compute/src/cast_to_variant.rs
+++ b/parquet-variant-compute/src/cast_to_variant.rs
@@ -23,10 +23,11 @@ use arrow::array::{
TimestampSecondArray,
};
use arrow::datatypes::{
- i256, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type,
Decimal256Type,
- Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type,
Int16Type, Int32Type,
- Int64Type, Int8Type, LargeBinaryType, Time32MillisecondType,
Time32SecondType,
- Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type,
UInt64Type, UInt8Type,
+ i256, ArrowNativeType, BinaryType, BinaryViewType, Date32Type, Date64Type,
Decimal128Type,
+ Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type,
Float64Type, Int16Type,
+ Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType,
Time32MillisecondType,
+ Time32SecondType, Time64MicrosecondType, Time64NanosecondType, UInt16Type,
UInt32Type,
+ UInt64Type, UInt8Type,
};
use arrow::temporal_conversions::{
timestamp_ms_to_datetime, timestamp_ns_to_datetime,
timestamp_s_to_datetime,
@@ -502,6 +503,17 @@ pub fn cast_to_variant(input: &dyn Array) ->
Result<VariantArray, ArrowError> {
builder
);
}
+ DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
+ DataType::Int16 => process_run_end_encoded::<Int16Type>(input,
&mut builder)?,
+ DataType::Int32 => process_run_end_encoded::<Int32Type>(input,
&mut builder)?,
+ DataType::Int64 => process_run_end_encoded::<Int64Type>(input,
&mut builder)?,
+ _ => {
+ return Err(ArrowError::CastError(format!(
+ "Unsupported run ends type: {:?}",
+ run_ends.data_type()
+ )));
+ }
+ },
DataType::Dictionary(_, _) => {
let dict_array = input.as_any_dictionary();
let values_variant_array =
cast_to_variant(dict_array.values().as_ref())?;
@@ -532,6 +544,41 @@ pub fn cast_to_variant(input: &dyn Array) ->
Result<VariantArray, ArrowError> {
Ok(builder.build())
}
+/// Generic function to process run-end encoded arrays
+fn process_run_end_encoded<R: RunEndIndexType>(
+ input: &dyn Array,
+ builder: &mut VariantArrayBuilder,
+) -> Result<(), ArrowError> {
+ let run_array = input.as_run::<R>();
+ let values_variant_array = cast_to_variant(run_array.values().as_ref())?;
+
+ // Process runs in batches for better performance
+ let run_ends = run_array.run_ends().values();
+ let mut logical_start = 0;
+
+ for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+ let logical_end = run_end.as_usize();
+ let run_length = logical_end - logical_start;
+
+ if values_variant_array.is_null(physical_idx) {
+ // Append nulls for the entire run
+ for _ in 0..run_length {
+ builder.append_null();
+ }
+ } else {
+ // Get the value once and append it for the entire run
+ let value = values_variant_array.value(physical_idx);
+ for _ in 0..run_length {
+ builder.append_variant(value.clone());
+ }
+ }
+
+ logical_start = logical_end;
+ }
+
+ Ok(())
+}
+
// TODO do we need a cast_with_options to allow specifying conversion behavior,
// e.g. how to handle overflows, whether to convert to Variant::Null or return
// an error, etc. ?
@@ -544,9 +591,9 @@ mod tests {
Decimal256Array, Decimal32Array, Decimal64Array, DictionaryArray,
FixedSizeBinaryBuilder,
Float16Array, Float32Array, Float64Array, GenericByteBuilder,
GenericByteViewBuilder,
Int16Array, Int32Array, Int64Array, Int8Array, IntervalYearMonthArray,
LargeStringArray,
- NullArray, StringArray, StringViewArray, StructArray,
Time32MillisecondArray,
- Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
UInt16Array, UInt32Array,
- UInt64Array, UInt8Array,
+ NullArray, StringArray, StringRunBuilder, StringViewArray, StructArray,
+ Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray,
Time64NanosecondArray,
+ UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
use arrow::buffer::NullBuffer;
use arrow_schema::{Field, Fields};
@@ -1847,6 +1894,58 @@ mod tests {
);
}
+ #[test]
+ fn test_cast_to_variant_run_end_encoded() {
+ let mut builder = StringRunBuilder::<Int32Type>::new();
+ builder.append_value("apple");
+ builder.append_value("apple");
+ builder.append_value("banana");
+ builder.append_value("banana");
+ builder.append_value("banana");
+ builder.append_value("cherry");
+ let run_array = builder.finish();
+
+ run_test(
+ Arc::new(run_array),
+ vec![
+ Some(Variant::from("apple")),
+ Some(Variant::from("apple")),
+ Some(Variant::from("banana")),
+ Some(Variant::from("banana")),
+ Some(Variant::from("banana")),
+ Some(Variant::from("cherry")),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_cast_to_variant_run_end_encoded_with_nulls() {
+ use arrow::array::StringRunBuilder;
+ use arrow::datatypes::Int32Type;
+
+ // Test run-end encoded array with nulls
+ let mut builder = StringRunBuilder::<Int32Type>::new();
+ builder.append_value("apple");
+ builder.append_null();
+ builder.append_value("banana");
+ builder.append_value("banana");
+ builder.append_null();
+ builder.append_null();
+ let run_array = builder.finish();
+
+ run_test(
+ Arc::new(run_array),
+ vec![
+ Some(Variant::from("apple")),
+ None,
+ Some(Variant::from("banana")),
+ Some(Variant::from("banana")),
+ None,
+ None,
+ ],
+ );
+ }
+
#[test]
fn test_cast_to_variant_dictionary() {
let values = StringArray::from(vec!["apple", "banana", "cherry",
"date"]);