brancz commented on code in PR #7713: URL: https://github.com/apache/arrow-rs/pull/7713#discussion_r2280383120
########## arrow-cast/src/cast/run_array.rs: ########## @@ -0,0 +1,269 @@ +use crate::cast::*; + +/// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE +/// and REE-to-other type conversions with proper validation and error handling. +/// +/// # Arguments +/// * `array` - The input Run-End Encoded array to be cast +/// * `to_type` - The target data type for the casting operation +/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs unsafe) +/// +/// # Returns +/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting fails +/// +/// # Behavior +/// This function handles two main casting scenarios: +/// +/// ## Case 1: REE-to-REE Casting +/// When casting to another Run-End Encoded type: +/// - Casts both the `values` and `run_ends` to their target types +/// - Validates that run-end casting only allows upcasts (Int16→Int32, Int16→Int64, Int32→Int64) +/// - Preserves the REE structure while updating both fields +/// - Returns a new `RunArray` with the appropriate run-end type (Int16, Int32, or Int64) +/// +/// ## Case 2: REE-to-Other Casting +/// When casting to a non-REE type: +/// - Expands the REE array to its logical form by unpacking all values +/// - Applies the target type casting to the expanded array +/// - Returns a regular array of the target type (e.g., StringArray, Int64Array) +/// +/// # Error Handling, error occurs if: +/// - the input array is not a Run-End Encoded array +/// - run-end downcasting would cause overflow +/// - the target run-end type is unsupported +/// - Propagates errors from underlying casting operations +/// +/// # Safety Considerations +/// - Run-end casting uses `safe: false` to prevent silent overflow +/// - Only upcasts are allowed for run-ends to maintain valid REE structure +/// - Unpacking preserves null values and array length +/// - Type validation ensures only supported run-end types (Int16, Int32, Int64) +/// +/// # Performance Notes +/// - REE-to-REE casting is efficient as it operates on the compressed structure +/// - REE-to-other casting requires full unpacking, which may be expensive for large arrays +/// - Run-end validation adds minimal overhead for safety +pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>( + array: &dyn Array, + to_type: &DataType, + cast_options: &CastOptions, +) -> Result<ArrayRef, ArrowError> { + match array.data_type() { + DataType::RunEndEncoded(_, _) => { + let run_array = array + .as_any() + .downcast_ref::<RunArray<K>>() + .ok_or_else(|| ArrowError::CastError("Expected RunArray".to_string()))?; + + let values = run_array.values(); + + match to_type { + // CASE 1: Stay as RunEndEncoded, cast only the values + DataType::RunEndEncoded(target_index_field, target_value_field) => { + let cast_values = + cast_with_options(values, target_value_field.data_type(), cast_options)?; + + let run_ends_array = PrimitiveArray::<K>::from_iter_values( + run_array.run_ends().values().iter().copied(), + ); + let cast_run_ends = cast_with_options( + &run_ends_array, + target_index_field.data_type(), + cast_options, + )?; + let new_run_array: ArrayRef = match target_index_field.data_type() { + DataType::Int16 => { + let re = cast_run_ends.as_primitive::<Int16Type>(); + Arc::new(RunArray::<Int16Type>::try_new(re, cast_values.as_ref())?) + } + DataType::Int32 => { + let re = cast_run_ends.as_primitive::<Int32Type>(); + Arc::new(RunArray::<Int32Type>::try_new(re, cast_values.as_ref())?) + } + DataType::Int64 => { + let re = cast_run_ends.as_primitive::<Int64Type>(); + Arc::new(RunArray::<Int64Type>::try_new(re, cast_values.as_ref())?) + } + _ => { + return Err(ArrowError::CastError( + "Run-end type must be i16, i32, or i64".to_string(), + )) + } + }; + Ok(Arc::new(new_run_array)) + } + + // CASE 2: Expand to logical form + _ => { + let total_len = run_array.len(); + let indices = Int32Array::from_iter_values( + (0..total_len).map(|i| run_array.get_physical_index(i) as i32), + ); + + let taken = take(values.as_ref(), &indices, None)?; + + if taken.data_type() != to_type { + cast_with_options(taken.as_ref(), to_type, cast_options) + } else { + Ok(taken) + } + } + } + } + + _ => Err(ArrowError::CastError(format!( + "Cannot cast array of type {:?} to RunEndEncodedArray", + array.data_type() + ))), + } +} + +/// Attempts to cast an array to a RunEndEncoded array with the specified index type K +/// and value type. This function performs run-end encoding on the input array. +/// +/// # Arguments +/// * `array` - The input array to be run-end encoded +/// * `value_type` - The target data type for the values in the RunEndEncoded array +/// * `cast_options` - Options controlling the casting behavior +/// +/// # Returns +/// A `Result` containing the new `RunArray` or an `ArrowError` if casting fails +/// +/// # Process +/// 1. Cast the input array to the target value type if needed +/// 2. Iterate through the array to identify runs of consecutive equal values +/// 3. Build run_ends array indicating where each run terminates +/// 4. Build values array containing the unique values for each run +/// 5. Construct and return the RunArray +pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>( + array: &dyn Array, + value_type: &DataType, + cast_options: &CastOptions, +) -> Result<ArrayRef, ArrowError> { + // Step 1: Cast the input array to the target value type if necessary + let cast_array = if array.data_type() == value_type { + // No casting needed, use the array as-is + make_array(array.to_data()) Review Comment: this copy is not necessary, you can replace this with just `array` and then use `&cast_with_options(array, value_type, cast_options)?` in the other branch ########## arrow-cast/src/cast/run_array.rs: ########## @@ -0,0 +1,269 @@ +use crate::cast::*; + +/// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE +/// and REE-to-other type conversions with proper validation and error handling. +/// +/// # Arguments +/// * `array` - The input Run-End Encoded array to be cast +/// * `to_type` - The target data type for the casting operation +/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs unsafe) +/// +/// # Returns +/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting fails +/// +/// # Behavior +/// This function handles two main casting scenarios: +/// +/// ## Case 1: REE-to-REE Casting +/// When casting to another Run-End Encoded type: +/// - Casts both the `values` and `run_ends` to their target types +/// - Validates that run-end casting only allows upcasts (Int16→Int32, Int16→Int64, Int32→Int64) +/// - Preserves the REE structure while updating both fields +/// - Returns a new `RunArray` with the appropriate run-end type (Int16, Int32, or Int64) +/// +/// ## Case 2: REE-to-Other Casting +/// When casting to a non-REE type: +/// - Expands the REE array to its logical form by unpacking all values +/// - Applies the target type casting to the expanded array +/// - Returns a regular array of the target type (e.g., StringArray, Int64Array) +/// +/// # Error Handling, error occurs if: +/// - the input array is not a Run-End Encoded array +/// - run-end downcasting would cause overflow +/// - the target run-end type is unsupported +/// - Propagates errors from underlying casting operations +/// +/// # Safety Considerations +/// - Run-end casting uses `safe: false` to prevent silent overflow +/// - Only upcasts are allowed for run-ends to maintain valid REE structure +/// - Unpacking preserves null values and array length +/// - Type validation ensures only supported run-end types (Int16, Int32, Int64) +/// +/// # Performance Notes +/// - REE-to-REE casting is efficient as it operates on the compressed structure +/// - REE-to-other casting requires full unpacking, which may be expensive for large arrays +/// - Run-end validation adds minimal overhead for safety +pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>( + array: &dyn Array, + to_type: &DataType, + cast_options: &CastOptions, +) -> Result<ArrayRef, ArrowError> { + match array.data_type() { + DataType::RunEndEncoded(_, _) => { + let run_array = array + .as_any() + .downcast_ref::<RunArray<K>>() + .ok_or_else(|| ArrowError::CastError("Expected RunArray".to_string()))?; + + let values = run_array.values(); + + match to_type { + // CASE 1: Stay as RunEndEncoded, cast only the values + DataType::RunEndEncoded(target_index_field, target_value_field) => { + let cast_values = + cast_with_options(values, target_value_field.data_type(), cast_options)?; + + let run_ends_array = PrimitiveArray::<K>::from_iter_values( + run_array.run_ends().values().iter().copied(), + ); + let cast_run_ends = cast_with_options( + &run_ends_array, + target_index_field.data_type(), + cast_options, + )?; + let new_run_array: ArrayRef = match target_index_field.data_type() { + DataType::Int16 => { + let re = cast_run_ends.as_primitive::<Int16Type>(); + Arc::new(RunArray::<Int16Type>::try_new(re, cast_values.as_ref())?) + } + DataType::Int32 => { + let re = cast_run_ends.as_primitive::<Int32Type>(); + Arc::new(RunArray::<Int32Type>::try_new(re, cast_values.as_ref())?) + } + DataType::Int64 => { + let re = cast_run_ends.as_primitive::<Int64Type>(); + Arc::new(RunArray::<Int64Type>::try_new(re, cast_values.as_ref())?) + } + _ => { + return Err(ArrowError::CastError( + "Run-end type must be i16, i32, or i64".to_string(), + )) + } + }; + Ok(Arc::new(new_run_array)) + } + + // CASE 2: Expand to logical form + _ => { + let total_len = run_array.len(); + let indices = Int32Array::from_iter_values( + (0..total_len).map(|i| run_array.get_physical_index(i) as i32), + ); + + let taken = take(values.as_ref(), &indices, None)?; + + if taken.data_type() != to_type { + cast_with_options(taken.as_ref(), to_type, cast_options) + } else { + Ok(taken) + } + } + } + } + + _ => Err(ArrowError::CastError(format!( + "Cannot cast array of type {:?} to RunEndEncodedArray", + array.data_type() + ))), + } +} + +/// Attempts to cast an array to a RunEndEncoded array with the specified index type K +/// and value type. This function performs run-end encoding on the input array. +/// +/// # Arguments +/// * `array` - The input array to be run-end encoded +/// * `value_type` - The target data type for the values in the RunEndEncoded array +/// * `cast_options` - Options controlling the casting behavior +/// +/// # Returns +/// A `Result` containing the new `RunArray` or an `ArrowError` if casting fails +/// +/// # Process +/// 1. Cast the input array to the target value type if needed +/// 2. Iterate through the array to identify runs of consecutive equal values +/// 3. Build run_ends array indicating where each run terminates +/// 4. Build values array containing the unique values for each run +/// 5. Construct and return the RunArray +pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>( + array: &dyn Array, + value_type: &DataType, + cast_options: &CastOptions, +) -> Result<ArrayRef, ArrowError> { + // Step 1: Cast the input array to the target value type if necessary + let cast_array = if array.data_type() == value_type { + // No casting needed, use the array as-is + make_array(array.to_data()) + } else { + // Cast to the target value type + cast_with_options(array, value_type, cast_options)? + }; + + // Step 2: Run-end encode the cast array + // We'll use a builder to construct the RunArray efficiently + let mut run_ends_builder = PrimitiveBuilder::<K>::new(); + + if cast_array.len() == 0 { + // Handle empty array case + let empty_run_ends = run_ends_builder.finish(); + let empty_values = make_array(ArrayData::new_empty(value_type)); + return Ok(Arc::new(RunArray::<K>::try_new( + &empty_run_ends, + empty_values.as_ref(), + )?)); + } + + // Create a temporary builder to construct the run array + // We'll iterate through and build runs by comparing adjacent elements + let mut run_ends_vec = Vec::new(); + let mut values_indices = Vec::new(); + + // Add the first element as the start of the first run + values_indices.push(0); + // Step 3: Identify runs of consecutive equal values + for i in 1..cast_array.len() { + // We can afford to perform the simple comparison here as we already validated the type in [can_cast_run_end_encoded] + let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { + (true, true) => true, // Both null + (false, false) => { + // Both non-null - use slice comparison as a basic approach + // This is a simplified implementation Review Comment: As AIs like claude code like to do things like this comment (and frankly, various other comments and the PR description) and the simplified code below, it really makes me think this entire PR was AI-generated. If this is the case, please at least state this in the PR description. ########## arrow-cast/src/cast/run_array.rs: ########## @@ -0,0 +1,269 @@ +use crate::cast::*; + +/// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE +/// and REE-to-other type conversions with proper validation and error handling. +/// +/// # Arguments +/// * `array` - The input Run-End Encoded array to be cast +/// * `to_type` - The target data type for the casting operation +/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs unsafe) +/// +/// # Returns +/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting fails +/// +/// # Behavior +/// This function handles two main casting scenarios: +/// +/// ## Case 1: REE-to-REE Casting +/// When casting to another Run-End Encoded type: +/// - Casts both the `values` and `run_ends` to their target types +/// - Validates that run-end casting only allows upcasts (Int16→Int32, Int16→Int64, Int32→Int64) +/// - Preserves the REE structure while updating both fields +/// - Returns a new `RunArray` with the appropriate run-end type (Int16, Int32, or Int64) +/// +/// ## Case 2: REE-to-Other Casting +/// When casting to a non-REE type: +/// - Expands the REE array to its logical form by unpacking all values +/// - Applies the target type casting to the expanded array +/// - Returns a regular array of the target type (e.g., StringArray, Int64Array) +/// +/// # Error Handling, error occurs if: +/// - the input array is not a Run-End Encoded array +/// - run-end downcasting would cause overflow +/// - the target run-end type is unsupported +/// - Propagates errors from underlying casting operations +/// +/// # Safety Considerations +/// - Run-end casting uses `safe: false` to prevent silent overflow +/// - Only upcasts are allowed for run-ends to maintain valid REE structure +/// - Unpacking preserves null values and array length +/// - Type validation ensures only supported run-end types (Int16, Int32, Int64) +/// +/// # Performance Notes +/// - REE-to-REE casting is efficient as it operates on the compressed structure +/// - REE-to-other casting requires full unpacking, which may be expensive for large arrays +/// - Run-end validation adds minimal overhead for safety +pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>( + array: &dyn Array, + to_type: &DataType, + cast_options: &CastOptions, +) -> Result<ArrayRef, ArrowError> { + match array.data_type() { + DataType::RunEndEncoded(_, _) => { + let run_array = array + .as_any() + .downcast_ref::<RunArray<K>>() + .ok_or_else(|| ArrowError::CastError("Expected RunArray".to_string()))?; + + let values = run_array.values(); + + match to_type { + // CASE 1: Stay as RunEndEncoded, cast only the values + DataType::RunEndEncoded(target_index_field, target_value_field) => { + let cast_values = + cast_with_options(values, target_value_field.data_type(), cast_options)?; + + let run_ends_array = PrimitiveArray::<K>::from_iter_values( + run_array.run_ends().values().iter().copied(), + ); + let cast_run_ends = cast_with_options( + &run_ends_array, + target_index_field.data_type(), + cast_options, + )?; + let new_run_array: ArrayRef = match target_index_field.data_type() { + DataType::Int16 => { + let re = cast_run_ends.as_primitive::<Int16Type>(); + Arc::new(RunArray::<Int16Type>::try_new(re, cast_values.as_ref())?) + } + DataType::Int32 => { + let re = cast_run_ends.as_primitive::<Int32Type>(); + Arc::new(RunArray::<Int32Type>::try_new(re, cast_values.as_ref())?) + } + DataType::Int64 => { + let re = cast_run_ends.as_primitive::<Int64Type>(); + Arc::new(RunArray::<Int64Type>::try_new(re, cast_values.as_ref())?) + } + _ => { + return Err(ArrowError::CastError( + "Run-end type must be i16, i32, or i64".to_string(), + )) + } + }; + Ok(Arc::new(new_run_array)) + } + + // CASE 2: Expand to logical form + _ => { + let total_len = run_array.len(); + let indices = Int32Array::from_iter_values( + (0..total_len).map(|i| run_array.get_physical_index(i) as i32), + ); + + let taken = take(values.as_ref(), &indices, None)?; + + if taken.data_type() != to_type { + cast_with_options(taken.as_ref(), to_type, cast_options) + } else { + Ok(taken) + } + } + } + } + + _ => Err(ArrowError::CastError(format!( + "Cannot cast array of type {:?} to RunEndEncodedArray", + array.data_type() + ))), + } +} + +/// Attempts to cast an array to a RunEndEncoded array with the specified index type K +/// and value type. This function performs run-end encoding on the input array. +/// +/// # Arguments +/// * `array` - The input array to be run-end encoded +/// * `value_type` - The target data type for the values in the RunEndEncoded array +/// * `cast_options` - Options controlling the casting behavior +/// +/// # Returns +/// A `Result` containing the new `RunArray` or an `ArrowError` if casting fails +/// +/// # Process +/// 1. Cast the input array to the target value type if needed +/// 2. Iterate through the array to identify runs of consecutive equal values +/// 3. Build run_ends array indicating where each run terminates +/// 4. Build values array containing the unique values for each run +/// 5. Construct and return the RunArray +pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>( + array: &dyn Array, + value_type: &DataType, + cast_options: &CastOptions, +) -> Result<ArrayRef, ArrowError> { + // Step 1: Cast the input array to the target value type if necessary + let cast_array = if array.data_type() == value_type { + // No casting needed, use the array as-is + make_array(array.to_data()) + } else { + // Cast to the target value type + cast_with_options(array, value_type, cast_options)? + }; + + // Step 2: Run-end encode the cast array + // We'll use a builder to construct the RunArray efficiently + let mut run_ends_builder = PrimitiveBuilder::<K>::new(); + + if cast_array.len() == 0 { + // Handle empty array case + let empty_run_ends = run_ends_builder.finish(); + let empty_values = make_array(ArrayData::new_empty(value_type)); + return Ok(Arc::new(RunArray::<K>::try_new( + &empty_run_ends, + empty_values.as_ref(), + )?)); + } + + // Create a temporary builder to construct the run array + // We'll iterate through and build runs by comparing adjacent elements + let mut run_ends_vec = Vec::new(); + let mut values_indices = Vec::new(); + + // Add the first element as the start of the first run + values_indices.push(0); + // Step 3: Identify runs of consecutive equal values + for i in 1..cast_array.len() { + // We can afford to perform the simple comparison here as we already validated the type in [can_cast_run_end_encoded] + let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { + (true, true) => true, // Both null + (false, false) => { + // Both non-null - use slice comparison as a basic approach + // This is a simplified implementation + cast_array.slice(i, 1).to_data() == cast_array.slice(i - 1, 1).to_data() Review Comment: I realize this is simple to do this way, but this causes every element to be copied twice for every cast. I think much like with dictionary casting, it's unrealistic to support arbitrarily nested types (just like the `can_cast_to_run_end_encoded` correctly returns already), but for all non-nested types we should have specific implementations for primitive/decimals/byte/utf types. Then it's also trivial to do zero-copy comparisons for those types. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org