This is an automated email from the ASF dual-hosted git repository.
Jefffrey pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 2eeb805b8a Implement AnyRee (#9959)
2eeb805b8a is described below
commit 2eeb805b8a8b8ca67788917ec2f5220eb3e6f958
Author: RIchard Baah <[email protected]>
AuthorDate: Wed May 27 03:39:21 2026 -0400
Implement AnyRee (#9959)
# Which issue does this PR close?
closes #9909.
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Closes #9909.
# Rationale for this change
Makes the API simpler to work with and reduces code duplication.
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
# What changes are included in this PR?
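Replace the per-key-type RunEndEncoded match arms in length/bit_length
(arrow-string) and date_part (arrow-arith) with a single dispatch
through the new `AsArray::as_any_ree_opt` / `AsArray::as_any_ree` methods,
which return `&dyn AnyRunEndArray`, mirroring the existing dictionary
handling. This removes the `ree_map!` call sites (and their imports) from
these kernels, leaving one trait-object code path for all
Int16/Int32/Int64 run-end types.
`RunArray::with_values` now also rebuilds the values field from the data
type of the new values, so the resulting `DataType::RunEndEncoded` stays
accurate when a kernel changes the value type (for example, strings to
integer lengths).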
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
# Are these changes tested?
yes
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
If this PR claims a performance improvement, please include evidence
such as benchmark results.
-->
# Are there any user-facing changes?
no
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
---
arrow-arith/src/temporal.rs | 16 ++++-------
arrow-array/src/array/run_array.rs | 57 +++++++++++++++++++++++++++++++++++---
arrow-array/src/cast.rs | 20 +++++++++++++
arrow-string/src/length.rs | 27 ++++++------------
4 files changed, 87 insertions(+), 33 deletions(-)
diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs
index 301ad172da..769d309f58 100644
--- a/arrow-arith/src/temporal.rs
+++ b/arrow-arith/src/temporal.rs
@@ -24,7 +24,6 @@ use arrow_array::cast::AsArray;
use cast::as_primitive_array;
use chrono::{Datelike, TimeZone, Timelike, Utc};
-use arrow_array::ree_map;
use arrow_array::temporal_conversions::{
MICROSECONDS, MICROSECONDS_IN_DAY, MILLISECONDS, MILLISECONDS_IN_DAY,
NANOSECONDS,
NANOSECONDS_IN_DAY, SECONDS_IN_DAY, date32_to_datetime, date64_to_datetime,
@@ -253,15 +252,12 @@ pub fn date_part(array: &dyn Array, part: DatePart) ->
Result<ArrayRef, ArrowErr
let new_array = array.with_values(values);
Ok(new_array)
}
- DataType::RunEndEncoded(k, _) => match k.data_type() {
- DataType::Int16 => ree_map!(array, Int16Type, |a| date_part(a,
part)),
- DataType::Int32 => ree_map!(array, Int32Type, |a| date_part(a,
part)),
- DataType::Int64 => ree_map!(array, Int64Type, |a| date_part(a,
part)),
- _ => Err(ArrowError::InvalidArgumentError(format!(
- "Invalid run-end type: {:?}",
- k.data_type()
- ))),
- },
+ DataType::RunEndEncoded(_, _) => {
+ let array = array.as_any_ree();
+ let values = date_part(array.values(), part)?;
+ let new_array = array.with_values(values);
+ Ok(new_array)
+ }
t => return_compute_error_with!(format!("{part} does not support"), t),
)
}
diff --git a/arrow-array/src/array/run_array.rs
b/arrow-array/src/array/run_array.rs
index 02bc730b32..09fb2998a2 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -245,13 +245,20 @@ impl<R: RunEndIndexType> RunArray<R> {
/// assert_eq!(new_run_array.run_ends().values(), &[2, 3, 5]);
/// ```
pub fn with_values(&self, values: ArrayRef) -> Self {
- assert_eq!(values.len(), self.values().len());
+ assert_eq!(values.len(), self.values.len());
let (run_ends_field, values_field) = match &self.data_type {
- DataType::RunEndEncoded(r, v) => (r, v),
+ DataType::RunEndEncoded(r, v) => {
+ let new_v = Arc::new(Field::new(
+ v.name(),
+ values.data_type().clone(),
+ v.is_nullable(),
+ ));
+ (r, new_v)
+ }
_ => unreachable!("RunArray should have type RunEndEncoded"),
};
- let data_type =
- DataType::RunEndEncoded(Arc::clone(run_ends_field),
Arc::clone(values_field));
+ let data_type = DataType::RunEndEncoded(Arc::clone(run_ends_field),
values_field);
+
Self {
data_type,
run_ends: self.run_ends.clone(),
@@ -781,6 +788,28 @@ where
RunArrayIter::new(self)
}
}
+/// An array that can be downcast to a [`RunArray`] of any run end type and
any value type.
+///
+/// This can be used to efficiently implement kernels for all possible run end
+/// types without needing to create specialized implementations for each key
type.
+pub trait AnyRunEndArray: Array {
+ /// Returns the values of this array.
+ fn values(&self) -> &Arc<dyn Array>;
+
+ /// Returns a new run-end encoded array with the given values, preserving
the
+ /// existing run ends.
+ fn with_values(&self, values: ArrayRef) -> ArrayRef;
+}
+
+impl<R: RunEndIndexType> AnyRunEndArray for RunArray<R> {
+ fn values(&self) -> &Arc<dyn Array> {
+ &self.values
+ }
+
+ fn with_values(&self, values: ArrayRef) -> ArrayRef {
+ Arc::new(RunArray::<R>::with_values(self, values))
+ }
+}
#[cfg(test)]
mod tests {
@@ -789,6 +818,7 @@ mod tests {
use rand::seq::SliceRandom;
use super::*;
+ use crate::Int64Array;
use crate::builder::PrimitiveRunBuilder;
use crate::cast::AsArray;
use crate::new_empty_array;
@@ -1055,6 +1085,25 @@ mod tests {
let expected = ArrowError::InvalidArgumentError("The run_ends array
length should be the same as values array length. Run_ends array length is 3,
values array length is 4".to_string());
assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
}
+ #[test]
+ fn test_run_array_with_values_changes_value_type() {
+ let values = StringArray::from(vec!["foo", "bar", "baz"]);
+ let run_ends: Int32Array = [Some(1), Some(2),
Some(3)].into_iter().collect();
+ let ree = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
+
+ let new_values = Int64Array::from(vec![10, 20, 30]);
+ let result = ree.with_values(Arc::new(new_values));
+
+ match result.data_type() {
+ DataType::RunEndEncoded(_, v) => {
+ assert_eq!(v.data_type(), &DataType::Int64);
+ }
+ other => panic!("expected RunEndEncoded, got {other:?}"),
+ }
+
+ assert_eq!(result.values().data_type(), &DataType::Int64);
+ assert_eq!(result.values().len(), 3);
+ }
#[test]
fn test_run_array_run_ends_with_null() {
diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs
index d6cc242e02..d30de5906f 100644
--- a/arrow-array/src/cast.rs
+++ b/arrow-array/src/cast.rs
@@ -986,6 +986,14 @@ pub trait AsArray: private::Sealed {
fn as_any_dictionary(&self) -> &dyn AnyDictionaryArray {
self.as_any_dictionary_opt().expect("any dictionary array")
}
+
+ /// Downcasts this to a [`AnyRunEndArray`] returning `None` if not possible
+ fn as_any_ree_opt(&self) -> Option<&dyn AnyRunEndArray>;
+
+ /// Downcasts this to a [`AnyRunEndArray`] panicking if not possible
+ fn as_any_ree(&self) -> &dyn AnyRunEndArray {
+ self.as_any_ree_opt().expect("any run end array")
+ }
}
impl private::Sealed for dyn Array + '_ {}
@@ -1049,6 +1057,14 @@ impl AsArray for dyn Array + '_ {
_ => None
}
}
+
+ fn as_any_ree_opt(&self) -> Option<&dyn AnyRunEndArray> {
+ let array = self;
+ downcast_run_array! {
+ array => Some(array),
+ _ => None
+ }
+ }
}
impl private::Sealed for ArrayRef {}
@@ -1105,6 +1121,10 @@ impl AsArray for ArrayRef {
self.as_ref().as_any_dictionary_opt()
}
+ fn as_any_ree_opt(&self) -> Option<&dyn AnyRunEndArray> {
+ self.as_ref().as_any_ree_opt()
+ }
+
fn as_run_opt<K: RunEndIndexType>(&self) -> Option<&RunArray<K>> {
self.as_ref().as_run_opt()
}
diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs
index feefe1247e..99a9bd69a6 100644
--- a/arrow-string/src/length.rs
+++ b/arrow-string/src/length.rs
@@ -17,7 +17,6 @@
//! Defines kernel for length of string arrays and binary arrays
-use arrow_array::ree_map;
use arrow_array::*;
use arrow_array::{cast::AsArray, types::*};
use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer};
@@ -59,6 +58,10 @@ pub fn length(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
let lengths = length(d.values().as_ref())?;
return Ok(d.with_values(lengths));
}
+ if let Some(ree) = array.as_any_ree_opt() {
+ let lengths = length(ree.values())?;
+ return Ok(ree.with_values(lengths));
+ }
match array.data_type() {
DataType::List(_) => {
let list = array.as_list::<i32>();
@@ -117,15 +120,6 @@ pub fn length(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
list.nulls().cloned(),
)?))
}
- DataType::RunEndEncoded(k, _) => match k.data_type() {
- DataType::Int16 => ree_map!(array, Int16Type, length),
- DataType::Int32 => ree_map!(array, Int32Type, length),
- DataType::Int64 => ree_map!(array, Int64Type, length),
- _ => Err(ArrowError::InvalidArgumentError(format!(
- "Invalid run-end type: {:?}",
- k.data_type()
- ))),
- },
other => Err(ArrowError::ComputeError(format!(
"length not supported for {other:?}"
))),
@@ -144,6 +138,10 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
let lengths = bit_length(d.values().as_ref())?;
return Ok(d.with_values(lengths));
}
+ if let Some(ree) = array.as_any_ree_opt() {
+ let lengths = bit_length(ree.values())?;
+ return Ok(ree.with_values(lengths));
+ }
match array.data_type() {
DataType::Utf8 => {
@@ -190,15 +188,6 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
array.nulls().cloned(),
)?))
}
- DataType::RunEndEncoded(k, _) => match k.data_type() {
- DataType::Int16 => ree_map!(array, Int16Type, bit_length),
- DataType::Int32 => ree_map!(array, Int32Type, bit_length),
- DataType::Int64 => ree_map!(array, Int64Type, bit_length),
- _ => Err(ArrowError::InvalidArgumentError(format!(
- "Invalid run-end type: {:?}",
- k.data_type()
- ))),
- },
other => Err(ArrowError::ComputeError(format!(
"bit_length not supported for {other:?}"
))),