alamb commented on code in PR #8589:
URL: https://github.com/apache/arrow-rs/pull/8589#discussion_r2456039476
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -0,0 +1,262 @@
+use crate::cast::*;
+use arrow_ord::partition::partition;
+
+/// Attempts to cast a Run-End Encoded array to another type, handling both
REE-to-REE
+/// and REE-to-other type conversions with proper validation and error
handling.
+///
+/// # Arguments
+/// * `array` - The input Run-End Encoded array to be cast
+/// * `to_type` - The target data type for the casting operation
+/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs
unsafe)
+///
+/// # Returns
+/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting
fails
+///
+/// # Behavior
+/// This function handles two main casting scenarios:
+///
+/// ## Case 1: REE-to-REE Casting
+/// When casting to another Run-End Encoded type:
+/// - Casts both the `values` and `run_ends` to their target types
+/// - Validates that run-end casting only allows upcasts (Int16→Int32,
Int16→Int64, Int32→Int64)
+/// - Preserves the REE structure while updating both fields
+/// - Returns a new `RunArray` with the appropriate run-end type (Int16,
Int32, or Int64)
+///
+/// ## Case 2: REE-to-Other Casting
+/// When casting to a non-REE type:
+/// - Expands the REE array to its logical form by unpacking all values
+/// - Applies the target type casting to the expanded array
+/// - Returns a regular array of the target type (e.g., StringArray,
Int64Array)
+///
+/// # Error Handling, error occurs if:
+/// - the input array is not a Run-End Encoded array
+/// - run-end downcasting would cause overflow
+/// - the target run-end type is unsupported
+/// - Propagates errors from underlying casting operations
+///
+/// # Safety Considerations
+/// - Run-end casting uses `safe: false` to prevent silent overflow
Review Comment:
Isn't the `safe` flag passed in (via `cast_options`)?
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -0,0 +1,262 @@
+use crate::cast::*;
+use arrow_ord::partition::partition;
+
+/// Attempts to cast a Run-End Encoded array to another type, handling both
REE-to-REE
Review Comment:
These comments are a bit verbose (LLM generated?) for the amount of value they
add -- and I think they read more like a software ad than something that helps
the reader understand the code. Rather than listing all the features in a
(private) doc comment, I think it is better to document this with code and
inline comments where available.
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -0,0 +1,262 @@
+use crate::cast::*;
+use arrow_ord::partition::partition;
+
+/// Attempts to cast a Run-End Encoded array to another type, handling both
REE-to-REE
+/// and REE-to-other type conversions with proper validation and error
handling.
+///
+/// # Arguments
+/// * `array` - The input Run-End Encoded array to be cast
+/// * `to_type` - The target data type for the casting operation
+/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs
unsafe)
+///
+/// # Returns
+/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting
fails
+///
+/// # Behavior
+/// This function handles two main casting scenarios:
+///
+/// ## Case 1: REE-to-REE Casting
+/// When casting to another Run-End Encoded type:
+/// - Casts both the `values` and `run_ends` to their target types
+/// - Validates that run-end casting only allows upcasts (Int16→Int32,
Int16→Int64, Int32→Int64)
+/// - Preserves the REE structure while updating both fields
+/// - Returns a new `RunArray` with the appropriate run-end type (Int16,
Int32, or Int64)
+///
+/// ## Case 2: REE-to-Other Casting
+/// When casting to a non-REE type:
+/// - Expands the REE array to its logical form by unpacking all values
+/// - Applies the target type casting to the expanded array
+/// - Returns a regular array of the target type (e.g., StringArray,
Int64Array)
+///
+/// # Error Handling, error occurs if:
+/// - the input array is not a Run-End Encoded array
+/// - run-end downcasting would cause overflow
+/// - the target run-end type is unsupported
+/// - Propagates errors from underlying casting operations
+///
+/// # Safety Considerations
+/// - Run-end casting uses `safe: false` to prevent silent overflow
+/// - Only upcasts are allowed for run-ends to maintain valid REE structure
+/// - Unpacking preserves null values and array length
+/// - Type validation ensures only supported run-end types (Int16, Int32,
Int64)
+///
+/// # Performance Notes
+/// - REE-to-REE casting is efficient as it operates on the compressed
structure
+/// - REE-to-other casting requires full unpacking, which may be expensive for
large arrays
+/// - Run-end validation adds minimal overhead for safety
+pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
+ array: &dyn Array,
+ to_type: &DataType,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ match array.data_type() {
+ DataType::RunEndEncoded(_, _) => {
+ let run_array = array
+ .as_any()
+ .downcast_ref::<RunArray<K>>()
+ .ok_or_else(|| ArrowError::CastError("Expected
RunArray".to_string()))?;
+
+ let values = run_array.values();
+
+ match to_type {
+ // CASE 1: Stay as RunEndEncoded, cast only the values
+ DataType::RunEndEncoded(target_index_field,
target_value_field) => {
+ let cast_values =
+ cast_with_options(values,
target_value_field.data_type(), cast_options)?;
+
+ let run_ends_array = PrimitiveArray::<K>::from_iter_values(
+ run_array.run_ends().values().iter().copied(),
+ );
+ let cast_run_ends = cast_with_options(
+ &run_ends_array,
+ target_index_field.data_type(),
+ cast_options,
+ )?;
+ let new_run_array: ArrayRef = match
target_index_field.data_type() {
+ DataType::Int16 => {
+ let re = cast_run_ends.as_primitive::<Int16Type>();
+ Arc::new(RunArray::<Int16Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ DataType::Int32 => {
+ let re = cast_run_ends.as_primitive::<Int32Type>();
+ Arc::new(RunArray::<Int32Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ DataType::Int64 => {
+ let re = cast_run_ends.as_primitive::<Int64Type>();
+ Arc::new(RunArray::<Int64Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ _ => {
+ return Err(ArrowError::CastError(
+ "Run-end type must be i16, i32, or
i64".to_string(),
+ ));
+ }
+ };
+ Ok(Arc::new(new_run_array))
+ }
+
+ // CASE 2: Expand to logical form
+ _ => {
+ let total_len = run_array.len();
+ let indices = Int32Array::from_iter_values(
+ (0..total_len).map(|i| run_array.get_physical_index(i)
as i32),
Review Comment:
I suspect this is a pretty slow way to compute the take indices, as it does
a binary search for each logical index to find its physical index.
You can probably compute them more efficiently directly from the run ends,
though this can be done as a follow-on PR.
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -0,0 +1,262 @@
+use crate::cast::*;
+use arrow_ord::partition::partition;
+
+/// Attempts to cast a Run-End Encoded array to another type, handling both
REE-to-REE
+/// and REE-to-other type conversions with proper validation and error
handling.
+///
+/// # Arguments
+/// * `array` - The input Run-End Encoded array to be cast
+/// * `to_type` - The target data type for the casting operation
+/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs
unsafe)
+///
+/// # Returns
+/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting
fails
+///
+/// # Behavior
+/// This function handles two main casting scenarios:
+///
+/// ## Case 1: REE-to-REE Casting
+/// When casting to another Run-End Encoded type:
+/// - Casts both the `values` and `run_ends` to their target types
+/// - Validates that run-end casting only allows upcasts (Int16→Int32,
Int16→Int64, Int32→Int64)
+/// - Preserves the REE structure while updating both fields
+/// - Returns a new `RunArray` with the appropriate run-end type (Int16,
Int32, or Int64)
+///
+/// ## Case 2: REE-to-Other Casting
+/// When casting to a non-REE type:
+/// - Expands the REE array to its logical form by unpacking all values
+/// - Applies the target type casting to the expanded array
+/// - Returns a regular array of the target type (e.g., StringArray,
Int64Array)
+///
+/// # Error Handling, error occurs if:
+/// - the input array is not a Run-End Encoded array
+/// - run-end downcasting would cause overflow
+/// - the target run-end type is unsupported
+/// - Propagates errors from underlying casting operations
+///
+/// # Safety Considerations
+/// - Run-end casting uses `safe: false` to prevent silent overflow
+/// - Only upcasts are allowed for run-ends to maintain valid REE structure
+/// - Unpacking preserves null values and array length
+/// - Type validation ensures only supported run-end types (Int16, Int32,
Int64)
+///
+/// # Performance Notes
+/// - REE-to-REE casting is efficient as it operates on the compressed
structure
+/// - REE-to-other casting requires full unpacking, which may be expensive for
large arrays
+/// - Run-end validation adds minimal overhead for safety
+pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
+ array: &dyn Array,
+ to_type: &DataType,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ match array.data_type() {
+ DataType::RunEndEncoded(_, _) => {
+ let run_array = array
+ .as_any()
+ .downcast_ref::<RunArray<K>>()
+ .ok_or_else(|| ArrowError::CastError("Expected
RunArray".to_string()))?;
+
+ let values = run_array.values();
+
+ match to_type {
+ // CASE 1: Stay as RunEndEncoded, cast only the values
+ DataType::RunEndEncoded(target_index_field,
target_value_field) => {
+ let cast_values =
+ cast_with_options(values,
target_value_field.data_type(), cast_options)?;
+
+ let run_ends_array = PrimitiveArray::<K>::from_iter_values(
+ run_array.run_ends().values().iter().copied(),
+ );
+ let cast_run_ends = cast_with_options(
+ &run_ends_array,
+ target_index_field.data_type(),
+ cast_options,
+ )?;
+ let new_run_array: ArrayRef = match
target_index_field.data_type() {
+ DataType::Int16 => {
+ let re = cast_run_ends.as_primitive::<Int16Type>();
+ Arc::new(RunArray::<Int16Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ DataType::Int32 => {
+ let re = cast_run_ends.as_primitive::<Int32Type>();
+ Arc::new(RunArray::<Int32Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ DataType::Int64 => {
+ let re = cast_run_ends.as_primitive::<Int64Type>();
+ Arc::new(RunArray::<Int64Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ _ => {
+ return Err(ArrowError::CastError(
+ "Run-end type must be i16, i32, or
i64".to_string(),
+ ));
+ }
+ };
+ Ok(Arc::new(new_run_array))
+ }
+
+ // CASE 2: Expand to logical form
+ _ => {
+ let total_len = run_array.len();
+ let indices = Int32Array::from_iter_values(
+ (0..total_len).map(|i| run_array.get_physical_index(i)
as i32),
+ );
+
+ let taken = take(values.as_ref(), &indices, None)?;
+
+ if taken.data_type() != to_type {
+ cast_with_options(taken.as_ref(), to_type,
cast_options)
+ } else {
+ Ok(taken)
+ }
+ }
+ }
+ }
+
+ _ => Err(ArrowError::CastError(format!(
+ "Cannot cast array of type {:?} to RunEndEncodedArray",
+ array.data_type()
+ ))),
+ }
+}
+
+/// Attempts to cast an array to a RunEndEncoded array with the specified
index type K
+/// and value type. This function performs run-end encoding on the input array.
+///
+/// # Arguments
+/// * `array` - The input array to be run-end encoded
+/// * `value_type` - The target data type for the values in the RunEndEncoded
array
+/// * `cast_options` - Options controlling the casting behavior
+///
+/// # Returns
+/// A `Result` containing the new `RunArray` or an `ArrowError` if casting
fails
+///
+/// # Process
+/// 1. Cast the input array to the target value type if needed
+/// 2. Partition the array to identify runs of consecutive equal values
+/// 3. Build run_ends array indicating where each run terminates
+/// 4. Build values array containing the unique values for each run
+/// 5. Construct and return the RunArray
+pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>(
+ array: &ArrayRef,
+ value_type: &DataType,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let mut run_ends_builder = PrimitiveBuilder::<K>::new();
+
+ // Cast the input array to the target value type if necessary
+ let cast_array = if array.data_type() == value_type {
+ array
+ } else {
+ &cast_with_options(array, value_type, cast_options)?
+ };
+
+ // Return early if the array to cast is empty
+ if cast_array.is_empty() {
+ let empty_run_ends = run_ends_builder.finish();
+ let empty_values = make_array(ArrayData::new_empty(value_type));
+ return Ok(Arc::new(RunArray::<K>::try_new(
+ &empty_run_ends,
+ empty_values.as_ref(),
+ )?));
+ }
+
+ // REE arrays are handled by run_end_encoded_cast
+ if let DataType::RunEndEncoded(_, _) = array.data_type() {
+ unreachable!()
+ }
+
+ // Partition the array to identify runs of consecutive equal values
+ let partitions = partition(&[Arc::clone(cast_array)])?;
+ let mut run_ends = Vec::new();
+ let mut values_indexes = Vec::new();
+ let mut last_partition_end = 0;
+ for partition in partitions.ranges() {
+ values_indexes.push(last_partition_end);
+ run_ends.push(partition.end);
+ last_partition_end = partition.end;
+ }
+
+ // Build the run_ends array
+ for run_end in run_ends {
+ run_ends_builder.append_value(
+ K::Native::from_usize(run_end)
+ .ok_or_else(|| ArrowError::CastError("Run end index out of
range".to_string()))?,
+ );
+ }
+ let run_ends_array = run_ends_builder.finish();
+ // Build the values array by taking elements at the run start positions
+ let indices = PrimitiveArray::<UInt32Type>::from_iter_values(
+ values_indexes.iter().map(|&idx| idx as u32),
+ );
+ let values_array = take(&cast_array, &indices, None)?;
+
+ // Create and return the RunArray
+ let run_array = RunArray::<K>::try_new(&run_ends_array,
values_array.as_ref())?;
+ Ok(Arc::new(run_array))
+}
+
+/// Checks if a given data type can be cast to a RunEndEncoded array.
+///
+/// # Arguments
+/// * `from_type` - The source data type to be checked
+/// * `to_type` - The target data type to be checked
+pub(crate) fn can_cast_to_run_end_encoded(from_type: &DataType, to_type:
&DataType) -> bool {
+ match to_type {
+ DataType::RunEndEncoded(_, _) => {
+ // Check if from_type supports equality (can be REE-encoded)
+ match from_type {
+ // Primitive types - support equality
+ DataType::Boolean
+ | DataType::Int8
+ | DataType::Int16
+ | DataType::Int32
+ | DataType::Int64
+ | DataType::UInt8
+ | DataType::UInt16
+ | DataType::UInt32
+ | DataType::UInt64
+ | DataType::Float16
+ | DataType::Float32
+ | DataType::Float64 => true,
+
+ // String types - support equality
+ DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View =>
true,
+
+ // Binary types - support equality
+ DataType::Binary
+ | DataType::LargeBinary
+ | DataType::BinaryView
+ | DataType::FixedSizeBinary(_) => true,
+
+ // Temporal types - support equality
+ DataType::Date32
+ | DataType::Date64
+ | DataType::Timestamp(_, _)
+ | DataType::Time32(_)
+ | DataType::Time64(_)
+ | DataType::Duration(_)
+ | DataType::Interval(_) => true,
+ DataType::Decimal32(_, _)
+ | DataType::Decimal64(_, _)
+ | DataType::Decimal128(_, _)
+ | DataType::Decimal256(_, _) => true,
+ DataType::RunEndEncoded(_, _) => true,
+
+ // Dictionary types are supported
+ DataType::Dictionary(_, _) => true,
+
+ // Unsupported types
Review Comment:
Why are these types unsupported? It seems like your algorithm only requires
them to support `partition` and `take`.
Maybe you can clarify in the comments why they aren't supported.
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -0,0 +1,262 @@
+use crate::cast::*;
+use arrow_ord::partition::partition;
+
+/// Attempts to cast a Run-End Encoded array to another type, handling both
REE-to-REE
+/// and REE-to-other type conversions with proper validation and error
handling.
+///
+/// # Arguments
+/// * `array` - The input Run-End Encoded array to be cast
+/// * `to_type` - The target data type for the casting operation
+/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs
unsafe)
+///
+/// # Returns
+/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting
fails
+///
+/// # Behavior
+/// This function handles two main casting scenarios:
+///
+/// ## Case 1: REE-to-REE Casting
+/// When casting to another Run-End Encoded type:
+/// - Casts both the `values` and `run_ends` to their target types
+/// - Validates that run-end casting only allows upcasts (Int16→Int32,
Int16→Int64, Int32→Int64)
+/// - Preserves the REE structure while updating both fields
+/// - Returns a new `RunArray` with the appropriate run-end type (Int16,
Int32, or Int64)
+///
+/// ## Case 2: REE-to-Other Casting
+/// When casting to a non-REE type:
+/// - Expands the REE array to its logical form by unpacking all values
+/// - Applies the target type casting to the expanded array
+/// - Returns a regular array of the target type (e.g., StringArray,
Int64Array)
+///
+/// # Error Handling, error occurs if:
+/// - the input array is not a Run-End Encoded array
+/// - run-end downcasting would cause overflow
+/// - the target run-end type is unsupported
+/// - Propagates errors from underlying casting operations
+///
+/// # Safety Considerations
+/// - Run-end casting uses `safe: false` to prevent silent overflow
+/// - Only upcasts are allowed for run-ends to maintain valid REE structure
+/// - Unpacking preserves null values and array length
+/// - Type validation ensures only supported run-end types (Int16, Int32,
Int64)
+///
+/// # Performance Notes
+/// - REE-to-REE casting is efficient as it operates on the compressed
structure
+/// - REE-to-other casting requires full unpacking, which may be expensive for
large arrays
+/// - Run-end validation adds minimal overhead for safety
+pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
+ array: &dyn Array,
+ to_type: &DataType,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ match array.data_type() {
+ DataType::RunEndEncoded(_, _) => {
+ let run_array = array
+ .as_any()
+ .downcast_ref::<RunArray<K>>()
+ .ok_or_else(|| ArrowError::CastError("Expected
RunArray".to_string()))?;
+
+ let values = run_array.values();
+
+ match to_type {
+ // CASE 1: Stay as RunEndEncoded, cast only the values
+ DataType::RunEndEncoded(target_index_field,
target_value_field) => {
+ let cast_values =
+ cast_with_options(values,
target_value_field.data_type(), cast_options)?;
+
+ let run_ends_array = PrimitiveArray::<K>::from_iter_values(
+ run_array.run_ends().values().iter().copied(),
+ );
+ let cast_run_ends = cast_with_options(
+ &run_ends_array,
+ target_index_field.data_type(),
+ cast_options,
Review Comment:
The doc comments above say this kernel uses `safe: false` for this cast, but
this code seems to pass the caller's `cast_options` through unchanged. Can you
please update the documentation to be consistent?
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -0,0 +1,254 @@
+use crate::cast::*;
+use arrow_ord::partition::partition;
+
+/// Attempts to cast a Run-End Encoded array to another type, handling both
REE-to-REE
+/// and REE-to-other type conversions with proper validation and error
handling.
+///
+/// # Arguments
+/// * `array` - The input Run-End Encoded array to be cast
+/// * `to_type` - The target data type for the casting operation
+/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs
unsafe)
+///
+/// # Returns
+/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting
fails
+///
+/// # Behavior
+/// This function handles two main casting scenarios:
+///
+/// ## Case 1: REE-to-REE Casting
+/// When casting to another Run-End Encoded type:
+/// - Casts both the `values` and `run_ends` to their target types
+/// - Validates that run-end casting only allows upcasts (Int16→Int32,
Int16→Int64, Int32→Int64)
+/// - Preserves the REE structure while updating both fields
+/// - Returns a new `RunArray` with the appropriate run-end type (Int16,
Int32, or Int64)
+///
+/// ## Case 2: REE-to-Other Casting
+/// When casting to a non-REE type:
+/// - Expands the REE array to its logical form by unpacking all values
+/// - Applies the target type casting to the expanded array
+/// - Returns a regular array of the target type (e.g., StringArray,
Int64Array)
+///
+/// # Error Handling, error occurs if:
+/// - the input array is not a Run-End Encoded array
+/// - run-end downcasting would cause overflow
+/// - the target run-end type is unsupported
+/// - Propagates errors from underlying casting operations
+///
+/// # Safety Considerations
+/// - Run-end casting uses `safe: false` to prevent silent overflow
+/// - Only upcasts are allowed for run-ends to maintain valid REE structure
+/// - Unpacking preserves null values and array length
+/// - Type validation ensures only supported run-end types (Int16, Int32,
Int64)
+///
+/// # Performance Notes
+/// - REE-to-REE casting is efficient as it operates on the compressed
structure
+/// - REE-to-other casting requires full unpacking, which may be expensive for
large arrays
+/// - Run-end validation adds minimal overhead for safety
+pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
+ array: &dyn Array,
+ to_type: &DataType,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ match array.data_type() {
+ DataType::RunEndEncoded(_, _) => {
+ let run_array = array
+ .as_any()
+ .downcast_ref::<RunArray<K>>()
+ .ok_or_else(|| ArrowError::CastError("Expected
RunArray".to_string()))?;
+
+ let values = run_array.values();
+
+ match to_type {
+ // CASE 1: Stay as RunEndEncoded, cast only the values
+ DataType::RunEndEncoded(target_index_field,
target_value_field) => {
+ let cast_values =
+ cast_with_options(values,
target_value_field.data_type(), cast_options)?;
+
+ let run_ends_array = PrimitiveArray::<K>::from_iter_values(
+ run_array.run_ends().values().iter().copied(),
+ );
+ let cast_run_ends = cast_with_options(
+ &run_ends_array,
+ target_index_field.data_type(),
+ cast_options,
+ )?;
+ let new_run_array: ArrayRef = match
target_index_field.data_type() {
+ DataType::Int16 => {
+ let re = cast_run_ends.as_primitive::<Int16Type>();
+ Arc::new(RunArray::<Int16Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ DataType::Int32 => {
+ let re = cast_run_ends.as_primitive::<Int32Type>();
+ Arc::new(RunArray::<Int32Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ DataType::Int64 => {
+ let re = cast_run_ends.as_primitive::<Int64Type>();
+ Arc::new(RunArray::<Int64Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ _ => {
+ return Err(ArrowError::CastError(
+ "Run-end type must be i16, i32, or
i64".to_string(),
+ ));
+ }
+ };
+ Ok(Arc::new(new_run_array))
+ }
+
+ // CASE 2: Expand to logical form
+ _ => {
+ let total_len = run_array.len();
+ let indices = Int32Array::from_iter_values(
+ (0..total_len).map(|i| run_array.get_physical_index(i)
as i32),
+ );
+
+ let taken = take(values.as_ref(), &indices, None)?;
+
+ if taken.data_type() != to_type {
+ cast_with_options(taken.as_ref(), to_type,
cast_options)
+ } else {
+ Ok(taken)
+ }
+ }
+ }
+ }
+
+ _ => Err(ArrowError::CastError(format!(
+ "Cannot cast array of type {:?} to RunEndEncodedArray",
+ array.data_type()
+ ))),
+ }
+}
+
+/// Attempts to cast an array to a RunEndEncoded array with the specified
index type K
+/// and value type. This function performs run-end encoding on the input array.
+///
+/// # Arguments
+/// * `array` - The input array to be run-end encoded
+/// * `value_type` - The target data type for the values in the RunEndEncoded
array
+/// * `cast_options` - Options controlling the casting behavior
+///
+/// # Returns
+/// A `Result` containing the new `RunArray` or an `ArrowError` if casting
fails
+///
+/// # Process
+/// 1. Cast the input array to the target value type if needed
+/// 2. Partition the array to identify runs of consecutive equal values
+/// 3. Build run_ends array indicating where each run terminates
+/// 4. Build values array containing the unique values for each run
+/// 5. Construct and return the RunArray
+pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>(
+ array: &ArrayRef,
+ value_type: &DataType,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let mut run_ends_builder = PrimitiveBuilder::<K>::new();
+
+ // Cast the input array to the target value type if necessary
+ let cast_array = if array.data_type() == value_type {
+ array
+ } else {
+ &cast_with_options(array, value_type, cast_options)?
+ };
+
+ // Return early if the array to cast is empty
+ if cast_array.is_empty() {
+ let empty_run_ends = run_ends_builder.finish();
+ let empty_values = make_array(ArrayData::new_empty(value_type));
+ return Ok(Arc::new(RunArray::<K>::try_new(
+ &empty_run_ends,
+ empty_values.as_ref(),
+ )?));
+ }
+
+ // REE arrays are handled by run_end_encoded_cast
+ if let DataType::RunEndEncoded(_, _) = array.data_type() {
+ unreachable!()
+ }
+
+ // Partition the array to identify runs of consecutive equal values
+ let partitions = partition(&[array.clone()])?;
+ let mut run_ends = Vec::new();
+ let mut values_indexes = Vec::new();
+ let mut array_idx = 0;
+ for partition in partitions.ranges() {
+ values_indexes.push(array_idx);
+ array_idx += partition.end - partition.start;
+ run_ends.push(array_idx);
+ }
+
+ // Build the run_ends array
+ for run_end in run_ends {
+ run_ends_builder.append_value(
+ K::Native::from_usize(run_end)
+ .ok_or_else(|| ArrowError::CastError("Run end index out of
range".to_string()))?,
+ );
+ }
+ let run_ends_array = run_ends_builder.finish();
+ // Build the values array by taking elements at the run start positions
+ let indices = PrimitiveArray::<UInt32Type>::from_iter_values(
+ values_indexes.iter().map(|&idx| idx as u32),
+ );
+ let values_array = take(&cast_array, &indices, None)?;
Review Comment:
I think what @tustvold is getting at is that the `filter` kernel is (very)
fast, and thus using it rather than the `take` kernel is likely to be faster.
So I think the idea here is that, instead of building up `values_indexes`, we
would implement some way to turn the result of `partition` into a boolean array
and pass that to the `filter` kernel.
I think we can file this idea as a ticket and potentially do it as a follow-on
PR / performance optimization.
```rust
let partitions = partition(&[Arc::clone(cast_array)])?;
// ....
let mask: BooleanArray = partitions.into_inner(); // not sure about this API
// call filter instead of take
let values_array = filter(&cast_array, &mask)?;
```
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -0,0 +1,262 @@
+use crate::cast::*;
+use arrow_ord::partition::partition;
+
+/// Attempts to cast a Run-End Encoded array to another type, handling both
REE-to-REE
+/// and REE-to-other type conversions with proper validation and error
handling.
+///
+/// # Arguments
+/// * `array` - The input Run-End Encoded array to be cast
+/// * `to_type` - The target data type for the casting operation
+/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs
unsafe)
+///
+/// # Returns
+/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting
fails
+///
+/// # Behavior
+/// This function handles two main casting scenarios:
+///
+/// ## Case 1: REE-to-REE Casting
+/// When casting to another Run-End Encoded type:
+/// - Casts both the `values` and `run_ends` to their target types
+/// - Validates that run-end casting only allows upcasts (Int16→Int32,
Int16→Int64, Int32→Int64)
+/// - Preserves the REE structure while updating both fields
+/// - Returns a new `RunArray` with the appropriate run-end type (Int16,
Int32, or Int64)
+///
+/// ## Case 2: REE-to-Other Casting
+/// When casting to a non-REE type:
+/// - Expands the REE array to its logical form by unpacking all values
+/// - Applies the target type casting to the expanded array
+/// - Returns a regular array of the target type (e.g., StringArray,
Int64Array)
+///
+/// # Error Handling, error occurs if:
+/// - the input array is not a Run-End Encoded array
+/// - run-end downcasting would cause overflow
+/// - the target run-end type is unsupported
+/// - Propagates errors from underlying casting operations
+///
+/// # Safety Considerations
+/// - Run-end casting uses `safe: false` to prevent silent overflow
+/// - Only upcasts are allowed for run-ends to maintain valid REE structure
+/// - Unpacking preserves null values and array length
+/// - Type validation ensures only supported run-end types (Int16, Int32,
Int64)
Review Comment:
I am not sure these points have much value (they are implied by the cast
kernel, not specific to REE)
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -0,0 +1,262 @@
+use crate::cast::*;
+use arrow_ord::partition::partition;
+
+/// Attempts to cast a Run-End Encoded array to another type, handling both
REE-to-REE
+/// and REE-to-other type conversions with proper validation and error
handling.
+///
+/// # Arguments
+/// * `array` - The input Run-End Encoded array to be cast
+/// * `to_type` - The target data type for the casting operation
+/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs
unsafe)
+///
+/// # Returns
+/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting
fails
+///
+/// # Behavior
+/// This function handles two main casting scenarios:
+///
+/// ## Case 1: REE-to-REE Casting
+/// When casting to another Run-End Encoded type:
+/// - Casts both the `values` and `run_ends` to their target types
+/// - Validates that run-end casting only allows upcasts (Int16→Int32,
Int16→Int64, Int32→Int64)
+/// - Preserves the REE structure while updating both fields
+/// - Returns a new `RunArray` with the appropriate run-end type (Int16,
Int32, or Int64)
+///
+/// ## Case 2: REE-to-Other Casting
+/// When casting to a non-REE type:
+/// - Expands the REE array to its logical form by unpacking all values
+/// - Applies the target type casting to the expanded array
+/// - Returns a regular array of the target type (e.g., StringArray,
Int64Array)
+///
+/// # Error Handling, error occurs if:
+/// - the input array is not a Run-End Encoded array
+/// - run-end downcasting would cause overflow
+/// - the target run-end type is unsupported
+/// - Propagates errors from underlying casting operations
+///
+/// # Safety Considerations
+/// - Run-end casting uses `safe: false` to prevent silent overflow
+/// - Only upcasts are allowed for run-ends to maintain valid REE structure
+/// - Unpacking preserves null values and array length
+/// - Type validation ensures only supported run-end types (Int16, Int32,
Int64)
+///
+/// # Performance Notes
+/// - REE-to-REE casting is efficient as it operates on the compressed
structure
+/// - REE-to-other casting requires full unpacking, which may be expensive for
large arrays
+/// - Run-end validation adds minimal overhead for safety
+pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
+ array: &dyn Array,
+ to_type: &DataType,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ match array.data_type() {
+ DataType::RunEndEncoded(_, _) => {
+ let run_array = array
+ .as_any()
+ .downcast_ref::<RunArray<K>>()
+ .ok_or_else(|| ArrowError::CastError("Expected
RunArray".to_string()))?;
+
+ let values = run_array.values();
+
+ match to_type {
+ // CASE 1: Stay as RunEndEncoded, cast only the values
+ DataType::RunEndEncoded(target_index_field,
target_value_field) => {
+ let cast_values =
+ cast_with_options(values,
target_value_field.data_type(), cast_options)?;
+
+ let run_ends_array = PrimitiveArray::<K>::from_iter_values(
+ run_array.run_ends().values().iter().copied(),
+ );
+ let cast_run_ends = cast_with_options(
+ &run_ends_array,
+ target_index_field.data_type(),
+ cast_options,
+ )?;
+ let new_run_array: ArrayRef = match
target_index_field.data_type() {
+ DataType::Int16 => {
+ let re = cast_run_ends.as_primitive::<Int16Type>();
+ Arc::new(RunArray::<Int16Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ DataType::Int32 => {
+ let re = cast_run_ends.as_primitive::<Int32Type>();
+ Arc::new(RunArray::<Int32Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ DataType::Int64 => {
+ let re = cast_run_ends.as_primitive::<Int64Type>();
+ Arc::new(RunArray::<Int64Type>::try_new(re,
cast_values.as_ref())?)
+ }
+ _ => {
+ return Err(ArrowError::CastError(
+ "Run-end type must be i16, i32, or
i64".to_string(),
+ ));
+ }
+ };
+ Ok(Arc::new(new_run_array))
+ }
+
+ // CASE 2: Expand to logical form
+ _ => {
+ let total_len = run_array.len();
+ let indices = Int32Array::from_iter_values(
+ (0..total_len).map(|i| run_array.get_physical_index(i)
as i32),
+ );
+
+ let taken = take(values.as_ref(), &indices, None)?;
+
+ if taken.data_type() != to_type {
+ cast_with_options(taken.as_ref(), to_type,
cast_options)
+ } else {
+ Ok(taken)
+ }
+ }
+ }
+ }
+
+ _ => Err(ArrowError::CastError(format!(
+ "Cannot cast array of type {:?} to RunEndEncodedArray",
+ array.data_type()
+ ))),
+ }
+}
+
+/// Attempts to cast an array to a RunEndEncoded array with the specified
index type K
+/// and value type. This function performs run-end encoding on the input array.
+///
+/// # Arguments
+/// * `array` - The input array to be run-end encoded
+/// * `value_type` - The target data type for the values in the RunEndEncoded
array
+/// * `cast_options` - Options controlling the casting behavior
+///
+/// # Returns
+/// A `Result` containing the new `RunArray` or an `ArrowError` if casting
fails
+///
+/// # Process
+/// 1. Cast the input array to the target value type if needed
+/// 2. Partition the array to identify runs of consecutive equal values
+/// 3. Build run_ends array indicating where each run terminates
+/// 4. Build values array containing the unique values for each run
+/// 5. Construct and return the RunArray
+pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>(
+ array: &ArrayRef,
+ value_type: &DataType,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let mut run_ends_builder = PrimitiveBuilder::<K>::new();
+
+ // Cast the input array to the target value type if necessary
+ let cast_array = if array.data_type() == value_type {
+ array
+ } else {
+ &cast_with_options(array, value_type, cast_options)?
+ };
+
+ // Return early if the array to cast is empty
+ if cast_array.is_empty() {
+ let empty_run_ends = run_ends_builder.finish();
+ let empty_values = make_array(ArrayData::new_empty(value_type));
+ return Ok(Arc::new(RunArray::<K>::try_new(
+ &empty_run_ends,
+ empty_values.as_ref(),
+ )?));
+ }
+
+ // REE arrays are handled by run_end_encoded_cast
+ if let DataType::RunEndEncoded(_, _) = array.data_type() {
+ unreachable!()
Review Comment:
Can we please return an error here rather than panicking, so that:
1. this is consistent with the code above
2. any bugs result in errors rather than panics
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -11415,4 +11446,434 @@ mod tests {
"Invalid argument error: -1.0 is too small to store in a Decimal32
of precision 1. Min is -0.9"
);
}
+
+ #[test]
+ fn test_run_end_encoded_to_primitive() {
+ // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3]
+ let run_ends = Int32Array::from(vec![2, 5, 6]);
+ let values = Int32Array::from(vec![1, 2, 3]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+ // Cast to Int64
+ let cast_result = cast(&array_ref, &DataType::Int64).unwrap();
+ // Verify the result is a RunArray with Int64 values
+ let result_run_array =
cast_result.as_any().downcast_ref::<Int64Array>().unwrap();
+ assert_eq!(
+ result_run_array.values(),
+ &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]
+ );
+ }
+
+ #[test]
+ fn test_run_end_encoded_to_string() {
+ let run_ends = Int32Array::from(vec![2, 3, 5]);
+ let values = Int32Array::from(vec![10, 20, 30]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+
+ // Cast to String
+ let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
+
+ // Verify the result is a RunArray with String values
+ let result_array =
cast_result.as_any().downcast_ref::<StringArray>().unwrap();
+ // Check that values are correct
+ assert_eq!(result_array.value(0), "10");
+ assert_eq!(result_array.value(1), "10");
+ assert_eq!(result_array.value(2), "20");
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded() {
+ // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3]
+ let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, Int32>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check run structure: runs should end at positions [2, 5, 6]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]);
+
+ // Check values: should be [1, 2, 3]
+ let values_array =
result_run_array.values().as_primitive::<Int32Type>();
+ assert_eq!(values_array.values(), &[1, 2, 3]);
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded_with_nulls() {
+ let source_array = Int32Array::from(vec![
+ Some(1),
+ Some(1),
+ None,
+ None,
+ Some(2),
+ Some(2),
+ Some(3),
+ Some(3),
+ None,
+ None,
+ Some(4),
+ Some(4),
+ Some(5),
+ Some(5),
+ None,
+ None,
+ ]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+ assert_eq!(
+ result_run_array.run_ends().values(),
+ &[2, 4, 6, 8, 10, 12, 14, 16]
+ );
+ assert_eq!(
+ result_run_array
+ .values()
+ .as_primitive::<Int32Type>()
+ .values(),
+ &[1, 0, 2, 3, 0, 4, 5, 0]
+ );
+ assert_eq!(result_run_array.values().null_count(), 3);
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded_with_nulls_consecutive() {
+ let source_array = Int64Array::from(vec![
+ Some(1),
+ Some(1),
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ Some(4),
+ Some(20),
+ Some(500),
+ Some(500),
+ None,
+ None,
+ ]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int16, false)),
+ Arc::new(Field::new("values", DataType::Int64, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int16Type>>()
+ .unwrap();
+ assert_eq!(
+ result_run_array.run_ends().values(),
+ &[2, 10, 11, 12, 14, 16]
+ );
+ assert_eq!(
+ result_run_array
+ .values()
+ .as_primitive::<Int64Type>()
+ .values(),
+ &[1, 0, 4, 20, 500, 0]
+ );
+ assert_eq!(result_run_array.values().null_count(), 2);
+ }
+
+ #[test]
+ fn test_string_to_run_end_encoded() {
+ // Create a String array with repeated values: ["a", "a", "b", "c",
"c"]
+ let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, String>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Utf8, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check run structure: runs should end at positions [2, 3, 5]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]);
+
+ // Check values: should be ["a", "b", "c"]
+ let values_array = result_run_array.values().as_string::<i32>();
+ assert_eq!(values_array.value(0), "a");
+ assert_eq!(values_array.value(1), "b");
+ assert_eq!(values_array.value(2), "c");
+ }
+
+ #[test]
+ fn test_cast_with_type_conversion() {
Review Comment:
this test seems to be redundant with `test_run_end_encoded_to_string` above
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -11415,4 +11446,434 @@ mod tests {
"Invalid argument error: -1.0 is too small to store in a Decimal32
of precision 1. Min is -0.9"
);
}
+
+ #[test]
+ fn test_run_end_encoded_to_primitive() {
+ // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3]
+ let run_ends = Int32Array::from(vec![2, 5, 6]);
+ let values = Int32Array::from(vec![1, 2, 3]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+ // Cast to Int64
+ let cast_result = cast(&array_ref, &DataType::Int64).unwrap();
+ // Verify the result is a RunArray with Int64 values
+ let result_run_array =
cast_result.as_any().downcast_ref::<Int64Array>().unwrap();
+ assert_eq!(
+ result_run_array.values(),
+ &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]
+ );
+ }
+
+ #[test]
+ fn test_run_end_encoded_to_string() {
+ let run_ends = Int32Array::from(vec![2, 3, 5]);
+ let values = Int32Array::from(vec![10, 20, 30]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+
+ // Cast to String
+ let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
+
+ // Verify the result is a RunArray with String values
+ let result_array =
cast_result.as_any().downcast_ref::<StringArray>().unwrap();
+ // Check that values are correct
+ assert_eq!(result_array.value(0), "10");
+ assert_eq!(result_array.value(1), "10");
+ assert_eq!(result_array.value(2), "20");
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded() {
+ // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3]
+ let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, Int32>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check run structure: runs should end at positions [2, 5, 6]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]);
+
+ // Check values: should be [1, 2, 3]
+ let values_array =
result_run_array.values().as_primitive::<Int32Type>();
+ assert_eq!(values_array.values(), &[1, 2, 3]);
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded_with_nulls() {
+ let source_array = Int32Array::from(vec![
+ Some(1),
+ Some(1),
+ None,
+ None,
+ Some(2),
+ Some(2),
+ Some(3),
+ Some(3),
+ None,
+ None,
+ Some(4),
+ Some(4),
+ Some(5),
+ Some(5),
+ None,
+ None,
+ ]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+ assert_eq!(
+ result_run_array.run_ends().values(),
+ &[2, 4, 6, 8, 10, 12, 14, 16]
+ );
+ assert_eq!(
+ result_run_array
+ .values()
+ .as_primitive::<Int32Type>()
+ .values(),
+ &[1, 0, 2, 3, 0, 4, 5, 0]
+ );
+ assert_eq!(result_run_array.values().null_count(), 3);
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded_with_nulls_consecutive() {
+ let source_array = Int64Array::from(vec![
+ Some(1),
+ Some(1),
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ Some(4),
+ Some(20),
+ Some(500),
+ Some(500),
+ None,
+ None,
+ ]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int16, false)),
+ Arc::new(Field::new("values", DataType::Int64, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int16Type>>()
+ .unwrap();
+ assert_eq!(
+ result_run_array.run_ends().values(),
+ &[2, 10, 11, 12, 14, 16]
+ );
+ assert_eq!(
+ result_run_array
+ .values()
+ .as_primitive::<Int64Type>()
+ .values(),
+ &[1, 0, 4, 20, 500, 0]
+ );
+ assert_eq!(result_run_array.values().null_count(), 2);
+ }
+
+ #[test]
+ fn test_string_to_run_end_encoded() {
+ // Create a String array with repeated values: ["a", "a", "b", "c",
"c"]
+ let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, String>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Utf8, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check run structure: runs should end at positions [2, 3, 5]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]);
+
+ // Check values: should be ["a", "b", "c"]
+ let values_array = result_run_array.values().as_string::<i32>();
+ assert_eq!(values_array.value(0), "a");
+ assert_eq!(values_array.value(1), "b");
+ assert_eq!(values_array.value(2), "c");
+ }
+
+ #[test]
+ fn test_cast_with_type_conversion() {
+ // Create an Int32 array: [1, 1, 2, 2, 3]
+ let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, String> (values get converted to
strings)
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Utf8, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray with String values
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check that values were converted to strings
+ assert_eq!(result_run_array.values().data_type(), &DataType::Utf8);
+
+ // Check run structure: runs should end at positions [2, 4, 5]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]);
+
+ // Check values: should be ["1", "2", "3"]
+ let values_array = result_run_array.values().as_string::<i32>();
+ assert_eq!(values_array.value(0), "1");
+ assert_eq!(values_array.value(1), "2");
+ assert_eq!(values_array.value(2), "3");
+ }
+
+ #[test]
+ fn test_empty_array_to_run_end_encoded() {
+ // Create an empty Int32 array
+ let source_array = Int32Array::from(Vec::<i32>::new());
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, Int32>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is an empty RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check that both run_ends and values are empty
+ assert_eq!(result_run_array.run_ends().len(), 0);
+ assert_eq!(result_run_array.values().len(), 0);
+ }
+
+ #[test]
+ fn test_run_end_encoded_with_nulls() {
+ // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2]
+ let run_ends = Int32Array::from(vec![2, 3, 5]);
+ let values = Int32Array::from(vec![Some(1), None, Some(2)]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+
+ // Cast to String
+ let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
+
+ // Verify the result preserves nulls
+ let result_run_array =
cast_result.as_any().downcast_ref::<StringArray>().unwrap();
+ assert_eq!(result_run_array.value(0), "1");
+ assert!(result_run_array.is_null(2));
+ assert_eq!(result_run_array.value(4), "2");
+ }
+
+ #[test]
+ fn test_different_index_types() {
+ // Test with Int16 index type
+ let source_array = Int32Array::from(vec![1, 1, 2, 3, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int16, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ assert_eq!(cast_result.data_type(), &target_type);
+
+ // Test with Int64 index type
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int64, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ assert_eq!(cast_result.data_type(), &target_type);
+ }
+
+ #[test]
+ fn test_unsupported_cast_to_run_end_encoded() {
+ // Create a Struct array - complex nested type that might not be
supported
+ let field = Field::new("item", DataType::Int32, false);
+ let struct_array = StructArray::from(vec![(
+ Arc::new(field),
+ Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
+ )]);
+ let array_ref = Arc::new(struct_array) as ArrayRef;
+
+ // This should fail because:
+ // 1. The target type is not RunEndEncoded
+ // 2. The target type is not supported for casting from StructArray
+ let cast_result = cast(&array_ref, &DataType::FixedSizeBinary(10));
+
+ // Expect this to fail
+ assert!(cast_result.is_err());
+ }
+
+ /// Test casting RunEndEncoded<Int64, String> to RunEndEncoded<Int16,
String> should fail
+ #[test]
+ fn test_cast_run_end_encoded_int64_to_int16_should_fail() {
Review Comment:
Can you also please add a test for the `safe: true` case?
I would expect several of the entries to be null in that case.
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -11415,4 +11446,434 @@ mod tests {
"Invalid argument error: -1.0 is too small to store in a Decimal32
of precision 1. Min is -0.9"
);
}
+
+ #[test]
+ fn test_run_end_encoded_to_primitive() {
+ // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3]
+ let run_ends = Int32Array::from(vec![2, 5, 6]);
+ let values = Int32Array::from(vec![1, 2, 3]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+ // Cast to Int64
+ let cast_result = cast(&array_ref, &DataType::Int64).unwrap();
+ // Verify the result is a RunArray with Int64 values
+ let result_run_array =
cast_result.as_any().downcast_ref::<Int64Array>().unwrap();
+ assert_eq!(
+ result_run_array.values(),
+ &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]
+ );
+ }
+
+ #[test]
+ fn test_run_end_encoded_to_string() {
+ let run_ends = Int32Array::from(vec![2, 3, 5]);
+ let values = Int32Array::from(vec![10, 20, 30]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+
+ // Cast to String
+ let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
+
+ // Verify the result is a RunArray with String values
+ let result_array =
cast_result.as_any().downcast_ref::<StringArray>().unwrap();
+ // Check that values are correct
+ assert_eq!(result_array.value(0), "10");
+ assert_eq!(result_array.value(1), "10");
+ assert_eq!(result_array.value(2), "20");
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded() {
+ // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3]
+ let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, Int32>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check run structure: runs should end at positions [2, 5, 6]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]);
+
+ // Check values: should be [1, 2, 3]
+ let values_array =
result_run_array.values().as_primitive::<Int32Type>();
+ assert_eq!(values_array.values(), &[1, 2, 3]);
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded_with_nulls() {
+ let source_array = Int32Array::from(vec![
+ Some(1),
+ Some(1),
+ None,
+ None,
+ Some(2),
+ Some(2),
+ Some(3),
+ Some(3),
+ None,
+ None,
+ Some(4),
+ Some(4),
+ Some(5),
+ Some(5),
+ None,
+ None,
+ ]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+ assert_eq!(
+ result_run_array.run_ends().values(),
+ &[2, 4, 6, 8, 10, 12, 14, 16]
+ );
+ assert_eq!(
+ result_run_array
+ .values()
+ .as_primitive::<Int32Type>()
+ .values(),
+ &[1, 0, 2, 3, 0, 4, 5, 0]
+ );
+ assert_eq!(result_run_array.values().null_count(), 3);
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded_with_nulls_consecutive() {
+ let source_array = Int64Array::from(vec![
+ Some(1),
+ Some(1),
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ Some(4),
+ Some(20),
+ Some(500),
+ Some(500),
+ None,
+ None,
+ ]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int16, false)),
+ Arc::new(Field::new("values", DataType::Int64, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int16Type>>()
+ .unwrap();
+ assert_eq!(
+ result_run_array.run_ends().values(),
+ &[2, 10, 11, 12, 14, 16]
+ );
+ assert_eq!(
+ result_run_array
+ .values()
+ .as_primitive::<Int64Type>()
+ .values(),
+ &[1, 0, 4, 20, 500, 0]
+ );
+ assert_eq!(result_run_array.values().null_count(), 2);
+ }
+
+ #[test]
+ fn test_string_to_run_end_encoded() {
+ // Create a String array with repeated values: ["a", "a", "b", "c",
"c"]
+ let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, String>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Utf8, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check run structure: runs should end at positions [2, 3, 5]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]);
+
+ // Check values: should be ["a", "b", "c"]
+ let values_array = result_run_array.values().as_string::<i32>();
+ assert_eq!(values_array.value(0), "a");
+ assert_eq!(values_array.value(1), "b");
+ assert_eq!(values_array.value(2), "c");
+ }
+
+ #[test]
+ fn test_cast_with_type_conversion() {
+ // Create an Int32 array: [1, 1, 2, 2, 3]
+ let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, String> (values get converted to
strings)
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Utf8, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray with String values
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check that values were converted to strings
+ assert_eq!(result_run_array.values().data_type(), &DataType::Utf8);
+
+ // Check run structure: runs should end at positions [2, 4, 5]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]);
+
+ // Check values: should be ["1", "2", "3"]
+ let values_array = result_run_array.values().as_string::<i32>();
+ assert_eq!(values_array.value(0), "1");
+ assert_eq!(values_array.value(1), "2");
+ assert_eq!(values_array.value(2), "3");
+ }
+
+ #[test]
+ fn test_empty_array_to_run_end_encoded() {
+ // Create an empty Int32 array
+ let source_array = Int32Array::from(Vec::<i32>::new());
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, Int32>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is an empty RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check that both run_ends and values are empty
+ assert_eq!(result_run_array.run_ends().len(), 0);
+ assert_eq!(result_run_array.values().len(), 0);
+ }
+
+ #[test]
+ fn test_run_end_encoded_with_nulls() {
+ // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2]
+ let run_ends = Int32Array::from(vec![2, 3, 5]);
+ let values = Int32Array::from(vec![Some(1), None, Some(2)]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+
+ // Cast to String
+ let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
+
+ // Verify the result preserves nulls
+ let result_run_array =
cast_result.as_any().downcast_ref::<StringArray>().unwrap();
+ assert_eq!(result_run_array.value(0), "1");
+ assert!(result_run_array.is_null(2));
+ assert_eq!(result_run_array.value(4), "2");
+ }
+
+ #[test]
+ fn test_different_index_types() {
+ // Test with Int16 index type
+ let source_array = Int32Array::from(vec![1, 1, 2, 3, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int16, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ assert_eq!(cast_result.data_type(), &target_type);
+
+ // Test with Int64 index type
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int64, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ assert_eq!(cast_result.data_type(), &target_type);
Review Comment:
I think this test should also verify the actual indexes and values
##########
arrow-cast/Cargo.toml:
##########
@@ -43,6 +43,7 @@ force_validate = []
arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
arrow-data = { workspace = true }
+arrow-ord = { workspace = true }
Review Comment:
I think we are trying to keep the number of dependencies to a minimum.
I see this is used to call `partition`, which is clever but overly general. I
think you can also partition a (single) column using `eq` and an offset to look
for consecutive rows which are different.
Something like
```rust
let arr = ...;
let arr_shift1 = arr.slice(1, arr.len() - 1);
let transitions = eq(&arr.slice(0, arr.len() - 1), &arr_shift1);
```
However, the `eq` kernel is also in `arrow-ord`, so I am not sure there is a
way around it.
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -11415,4 +11446,434 @@ mod tests {
"Invalid argument error: -1.0 is too small to store in a Decimal32
of precision 1. Min is -0.9"
);
}
+
+ #[test]
+ fn test_run_end_encoded_to_primitive() {
+ // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3]
+ let run_ends = Int32Array::from(vec![2, 5, 6]);
+ let values = Int32Array::from(vec![1, 2, 3]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+ // Cast to Int64
+ let cast_result = cast(&array_ref, &DataType::Int64).unwrap();
+ // Verify the result is a RunArray with Int64 values
+ let result_run_array =
cast_result.as_any().downcast_ref::<Int64Array>().unwrap();
+ assert_eq!(
+ result_run_array.values(),
+ &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]
+ );
+ }
+
+ #[test]
+ fn test_run_end_encoded_to_string() {
+ let run_ends = Int32Array::from(vec![2, 3, 5]);
+ let values = Int32Array::from(vec![10, 20, 30]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+
+ // Cast to String
+ let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
+
+ // Verify the result is a RunArray with String values
+ let result_array =
cast_result.as_any().downcast_ref::<StringArray>().unwrap();
+ // Check that values are correct
+ assert_eq!(result_array.value(0), "10");
+ assert_eq!(result_array.value(1), "10");
+ assert_eq!(result_array.value(2), "20");
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded() {
+ // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3]
+ let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, Int32>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check run structure: runs should end at positions [2, 5, 6]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]);
+
+ // Check values: should be [1, 2, 3]
+ let values_array =
result_run_array.values().as_primitive::<Int32Type>();
+ assert_eq!(values_array.values(), &[1, 2, 3]);
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded_with_nulls() {
+ let source_array = Int32Array::from(vec![
+ Some(1),
+ Some(1),
+ None,
+ None,
+ Some(2),
+ Some(2),
+ Some(3),
+ Some(3),
+ None,
+ None,
+ Some(4),
+ Some(4),
+ Some(5),
+ Some(5),
+ None,
+ None,
+ ]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+ assert_eq!(
+ result_run_array.run_ends().values(),
+ &[2, 4, 6, 8, 10, 12, 14, 16]
+ );
+ assert_eq!(
+ result_run_array
+ .values()
+ .as_primitive::<Int32Type>()
+ .values(),
+ &[1, 0, 2, 3, 0, 4, 5, 0]
+ );
+ assert_eq!(result_run_array.values().null_count(), 3);
+ }
+
+ #[test]
+ fn test_primitive_to_run_end_encoded_with_nulls_consecutive() {
+ let source_array = Int64Array::from(vec![
+ Some(1),
+ Some(1),
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ Some(4),
+ Some(20),
+ Some(500),
+ Some(500),
+ None,
+ None,
+ ]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int16, false)),
+ Arc::new(Field::new("values", DataType::Int64, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int16Type>>()
+ .unwrap();
+ assert_eq!(
+ result_run_array.run_ends().values(),
+ &[2, 10, 11, 12, 14, 16]
+ );
+ assert_eq!(
+ result_run_array
+ .values()
+ .as_primitive::<Int64Type>()
+ .values(),
+ &[1, 0, 4, 20, 500, 0]
+ );
+ assert_eq!(result_run_array.values().null_count(), 2);
+ }
+
+ #[test]
+ fn test_string_to_run_end_encoded() {
+ // Create a String array with repeated values: ["a", "a", "b", "c",
"c"]
+ let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, String>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Utf8, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check run structure: runs should end at positions [2, 3, 5]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]);
+
+ // Check values: should be ["a", "b", "c"]
+ let values_array = result_run_array.values().as_string::<i32>();
+ assert_eq!(values_array.value(0), "a");
+ assert_eq!(values_array.value(1), "b");
+ assert_eq!(values_array.value(2), "c");
+ }
+
+ #[test]
+ fn test_cast_with_type_conversion() {
+ // Create an Int32 array: [1, 1, 2, 2, 3]
+ let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, String> (values get converted to
strings)
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Utf8, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is a RunArray with String values
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check that values were converted to strings
+ assert_eq!(result_run_array.values().data_type(), &DataType::Utf8);
+
+ // Check run structure: runs should end at positions [2, 4, 5]
+ assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]);
+
+ // Check values: should be ["1", "2", "3"]
+ let values_array = result_run_array.values().as_string::<i32>();
+ assert_eq!(values_array.value(0), "1");
+ assert_eq!(values_array.value(1), "2");
+ assert_eq!(values_array.value(2), "3");
+ }
+
+ #[test]
+ fn test_empty_array_to_run_end_encoded() {
+ // Create an empty Int32 array
+ let source_array = Int32Array::from(Vec::<i32>::new());
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ // Cast to RunEndEncoded<Int32, Int32>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+
+ // Verify the result is an empty RunArray
+ let result_run_array = cast_result
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+
+ // Check that both run_ends and values are empty
+ assert_eq!(result_run_array.run_ends().len(), 0);
+ assert_eq!(result_run_array.values().len(), 0);
+ }
+
+ #[test]
+ fn test_run_end_encoded_with_nulls() {
+ // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2]
+ let run_ends = Int32Array::from(vec![2, 3, 5]);
+ let values = Int32Array::from(vec![Some(1), None, Some(2)]);
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(run_array) as ArrayRef;
+
+ // Cast to String
+ let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
+
+ // Verify the result preserves nulls
+ let result_run_array =
cast_result.as_any().downcast_ref::<StringArray>().unwrap();
+ assert_eq!(result_run_array.value(0), "1");
+ assert!(result_run_array.is_null(2));
+ assert_eq!(result_run_array.value(4), "2");
+ }
+
+ #[test]
+ fn test_different_index_types() {
+ // Test with Int16 index type
+ let source_array = Int32Array::from(vec![1, 1, 2, 3, 3]);
+ let array_ref = Arc::new(source_array) as ArrayRef;
+
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int16, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ assert_eq!(cast_result.data_type(), &target_type);
+
+ // Test with Int64 index type
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int64, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ );
+ let cast_result = cast(&array_ref, &target_type).unwrap();
+ assert_eq!(cast_result.data_type(), &target_type);
+ }
+
+ #[test]
+ fn test_unsupported_cast_to_run_end_encoded() {
+ // Create a Struct array - complex nested type that might not be
supported
+ let field = Field::new("item", DataType::Int32, false);
+ let struct_array = StructArray::from(vec![(
+ Arc::new(field),
+ Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
+ )]);
+ let array_ref = Arc::new(struct_array) as ArrayRef;
+
+ // This should fail because:
+ // 1. The target type is not RunEndEncoded
+ // 2. The target type is not supported for casting from StructArray
+ let cast_result = cast(&array_ref, &DataType::FixedSizeBinary(10));
+
+ // Expect this to fail
+ assert!(cast_result.is_err());
+ }
+
+ /// Test casting RunEndEncoded<Int64, String> to RunEndEncoded<Int16,
String> should fail
+ #[test]
+ fn test_cast_run_end_encoded_int64_to_int16_should_fail() {
+ // Construct a valid REE array with Int64 run-ends
+ let run_ends = Int64Array::from(vec![100_000, 400_000, 700_000]); //
values too large for Int16
+ let values = StringArray::from(vec!["a", "b", "c"]);
+
+ let ree_array = RunArray::<Int64Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(ree_array) as ArrayRef;
+
+ // Attempt to cast to RunEndEncoded<Int16, Utf8>
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int16, false)),
+ Arc::new(Field::new("values", DataType::Utf8, true)),
+ );
+ let cast_options = CastOptions {
+ safe: false, // This should make it fail instead of returning nulls
+ format_options: FormatOptions::default(),
+ };
+
+ // This should fail due to run-end overflow
+ let result: Result<Arc<dyn Array + 'static>, ArrowError> =
+ cast_with_options(&array_ref, &target_type, &cast_options);
+
+ let e = result.err().expect("Cast should have failed but succeeded");
+ assert!(
+ e.to_string()
+ .contains("Cast error: Can't cast value 100000 to type Int16")
+ );
+ }
+
+ /// Test casting RunEndEncoded<Int16, String> to RunEndEncoded<Int64,
String> should succeed
+ #[test]
+ fn test_cast_run_end_encoded_int16_to_int64_should_succeed() {
+ // Construct a valid REE array with Int16 run-ends
+ let run_ends = Int16Array::from(vec![2, 5, 8]); // values that fit in
Int16
+ let values = StringArray::from(vec!["a", "b", "c"]);
+
+ let ree_array = RunArray::<Int16Type>::try_new(&run_ends,
&values).unwrap();
+ let array_ref = Arc::new(ree_array) as ArrayRef;
+
+ // Attempt to cast to RunEndEncoded<Int64, Utf8> (upcast should
succeed)
+ let target_type = DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int64, false)),
+ Arc::new(Field::new("values", DataType::Utf8, true)),
+ );
+ let cast_options = CastOptions {
+ safe: false,
+ format_options: FormatOptions::default(),
+ };
+
+ // This should succeed due to valid upcast
+ let result: Result<Arc<dyn Array + 'static>, ArrowError> =
+ cast_with_options(&array_ref, &target_type, &cast_options);
+
+ let array_ref = result.expect("Cast should have succeeded but failed");
+ // Downcast to RunArray<Int64Type>
+ let run_array = array_ref
+ .as_any()
+ .downcast_ref::<RunArray<Int64Type>>()
+ .unwrap();
+
+ // Verify the cast worked correctly
+ // Assert the values were cast correctly
+ assert_eq!(run_array.run_ends().values(), &[2i64, 5i64, 8i64]);
+ assert_eq!(run_array.values().as_string::<i32>().value(0), "a");
+ assert_eq!(run_array.values().as_string::<i32>().value(1), "b");
+ assert_eq!(run_array.values().as_string::<i32>().value(2), "c");
+ }
+
+ /// Test casting RunEndEncoded<Int32, String> to RunEndEncoded<Int16,
String> should fail
+ #[test]
Review Comment:
I am not sure this one adds much coverage over the test above for Int64->Int16
(it is fine to leave in, though).
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -0,0 +1,262 @@
+use crate::cast::*;
+use arrow_ord::partition::partition;
+
+/// Attempts to cast a Run-End Encoded array to another type, handling both
REE-to-REE
+/// and REE-to-other type conversions with proper validation and error
handling.
+///
+/// # Arguments
+/// * `array` - The input Run-End Encoded array to be cast
+/// * `to_type` - The target data type for the casting operation
+/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs
unsafe)
+///
+/// # Returns
+/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting
fails
+///
+/// # Behavior
+/// This function handles two main casting scenarios:
+///
+/// ## Case 1: REE-to-REE Casting
+/// When casting to another Run-End Encoded type:
+/// - Casts both the `values` and `run_ends` to their target types
+/// - Validates that run-end casting only allows upcasts (Int16→Int32,
Int16→Int64, Int32→Int64)
+/// - Preserves the REE structure while updating both fields
+/// - Returns a new `RunArray` with the appropriate run-end type (Int16,
Int32, or Int64)
+///
+/// ## Case 2: REE-to-Other Casting
+/// When casting to a non-REE type:
+/// - Expands the REE array to its logical form by unpacking all values
+/// - Applies the target type casting to the expanded array
+/// - Returns a regular array of the target type (e.g., StringArray,
Int64Array)
+///
+/// # Error Handling, error occurs if:
+/// - the input array is not a Run-End Encoded array
+/// - run-end downcasting would cause overflow
+/// - the target run-end type is unsupported
+/// - Propagates errors from underlying casting operations
+///
+/// # Safety Considerations
+/// - Run-end casting uses `safe: false` to prevent silent overflow
+/// - Only upcasts are allowed for run-ends to maintain valid REE structure
+/// - Unpacking preserves null values and array length
+/// - Type validation ensures only supported run-end types (Int16, Int32,
Int64)
+///
+/// # Performance Notes
+/// - REE-to-REE casting is efficient as it operates on the compressed
structure
+/// - REE-to-other casting requires full unpacking, which may be expensive for
large arrays
+/// - Run-end validation adds minimal overhead for safety
Review Comment:
If you are going to claim this, can you please back it up with data?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]