Jefffrey commented on code in PR #8716:
URL: https://github.com/apache/arrow-rs/pull/8716#discussion_r3408932281
##########
arrow-cast/src/cast/run_array.rs:
##########
@@ -169,3 +171,396 @@ pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>(
let run_array = RunArray::<K>::try_new(&run_ends_array, values_array.as_ref())?;
Ok(Arc::new(run_array))
}
+
+fn compute_run_boundaries(array: &ArrayRef) -> (Vec<usize>, Vec<usize>) {
+ if array.is_empty() {
+ return (Vec::new(), Vec::new());
+ }
+
+ use arrow_schema::DataType::*;
+
+ let array = array.as_ref();
+ downcast_primitive_array! {
+ array => runs_for_primitive(array),
+ Null => (vec![array.len()], vec![0]),
+ Boolean => runs_for_boolean(array.as_boolean()),
+ Utf8 => runs_for_string(array.as_string::<i32>()),
+ LargeUtf8 => runs_for_string(array.as_string::<i64>()),
+ Binary => runs_for_binary(array.as_binary::<i32>()),
+ LargeBinary => runs_for_binary(array.as_binary::<i64>()),
+ FixedSizeBinary(_) => runs_for_fixed_size_binary(array.as_fixed_size_binary()),
+ Dictionary(key_type, _) => match key_type.as_ref() {
+ Int8 => runs_for_dictionary::<Int8Type>(array.as_dictionary()),
+ Int16 => runs_for_dictionary::<Int16Type>(array.as_dictionary()),
+ Int32 => runs_for_dictionary::<Int32Type>(array.as_dictionary()),
+ Int64 => runs_for_dictionary::<Int64Type>(array.as_dictionary()),
+ UInt8 => runs_for_dictionary::<UInt8Type>(array.as_dictionary()),
+ UInt16 => runs_for_dictionary::<UInt16Type>(array.as_dictionary()),
+ UInt32 => runs_for_dictionary::<UInt32Type>(array.as_dictionary()),
+ UInt64 => runs_for_dictionary::<UInt64Type>(array.as_dictionary()),
+ _ => runs_generic(array),
+ },
+ _ => runs_generic(array),
+ }
+}
+
+fn runs_for_boolean(array: &BooleanArray) -> (Vec<usize>, Vec<usize>) {
+ let len = array.len();
+ if let Some(runs) = trivial_runs(len) {
+ return runs;
+ }
+
+ let mut run_boundaries = Vec::with_capacity(len / 64 + 2);
+ let mut current_valid = array.is_valid(0);
+ let mut current_value = if current_valid { array.value(0) } else { false };
+
+ for idx in 1..len {
+ // Treat a change in validity the same as a change in value so null boundaries are recorded.
+ let valid = array.is_valid(idx);
+ let mut boundary = false;
+ if current_valid && valid {
+ let value = array.value(idx);
+ if value != current_value {
+ current_value = value;
+ boundary = true;
+ }
+ } else if current_valid != valid {
+ boundary = true;
+ if valid {
+ current_value = array.value(idx);
+ }
+ }
+
+ if boundary {
+ ensure_capacity(&mut run_boundaries, len);
+ run_boundaries.push(idx);
+ }
+ current_valid = valid;
+ }
+
+ finalize_runs(run_boundaries, len)
+}
+
+fn runs_for_primitive<T: ArrowPrimitiveType>(
+ array: &PrimitiveArray<T>,
+) -> (Vec<usize>, Vec<usize>) {
+ let len = array.len();
+ if let Some(runs) = trivial_runs(len) {
+ return runs;
+ }
+
+ let values = array.values();
+ let mut run_boundaries = Vec::with_capacity(len / 64 + 2);
+
+ if array.null_count() == 0 {
+ let mut current = unsafe { *values.get_unchecked(0) };
+ let mut idx = 1;
+ while idx < len {
+ // Attempt to advance in 16-byte chunks before falling back to scalar comparison.
+ let boundary = scan_run_end::<T>(values, current, idx);
+ if boundary == len {
+ break;
+ }
+ ensure_capacity(&mut run_boundaries, len);
+ run_boundaries.push(boundary);
+ current = unsafe { *values.get_unchecked(boundary) };
+ idx = boundary + 1;
+ }
+ return finalize_runs(run_boundaries, len);
+ }
+
+ let nulls = array
+ .nulls()
+ .expect("null_count > 0 implies a null buffer is present");
+ let mut current_valid = nulls.is_valid(0);
+ let mut current_value = unsafe { *values.get_unchecked(0) };
+ for idx in 1..len {
+ let valid = nulls.is_valid(idx);
+ let mut boundary = false;
+ if current_valid && valid {
+ let value = unsafe { *values.get_unchecked(idx) };
+ if value != current_value {
+ current_value = value;
+ boundary = true;
+ }
+ } else if current_valid != valid {
+ boundary = true;
+ if valid {
+ current_value = unsafe { *values.get_unchecked(idx) };
+ }
+ }
+ if boundary {
+ ensure_capacity(&mut run_boundaries, len);
+ run_boundaries.push(idx);
+ }
+ current_valid = valid;
+ }
+ finalize_runs(run_boundaries, len)
+}
+
+fn runs_for_binary<O: OffsetSizeTrait>(array: &GenericBinaryArray<O>) -> (Vec<usize>, Vec<usize>) {
+ let mut to_usize = |v: O| v.as_usize();
+ runs_for_binary_like(
+ array.len(),
+ array.null_count(),
+ array.value_offsets(),
+ array.value_data(),
+ |idx| array.is_valid(idx),
+ &mut to_usize,
+ )
+}
+
+fn runs_for_binary_like<T: Copy>(
+ len: usize,
+ null_count: usize,
+ offsets: &[T],
+ values: &[u8],
+ mut is_valid: impl FnMut(usize) -> bool,
+ to_usize: &mut impl FnMut(T) -> usize,
+) -> (Vec<usize>, Vec<usize>) {
+ if let Some(runs) = trivial_runs(len) {
+ return runs;
+ }
+
Review Comment:
We can combine the binary+string methods like so:
```rust
fn runs_for_bytes<O: ByteArrayType>(array: &GenericByteArray<O>) -> (Vec<usize>, Vec<usize>) {
let len = array.len();
let null_count = array.null_count();
let offsets = array.value_offsets();
let values = array.value_data();
// rest of runs_for_binary_like()
```
- I made it generic over `GenericByteArray`, which covers both string and
binary arrays.

This means we can then use it like so, removing the need for `runs_for_binary()` and
`runs_for_string()`:
```rust
Utf8 => runs_for_bytes(array.as_string::<i32>()),
LargeUtf8 => runs_for_bytes(array.as_string::<i64>()),
Binary => runs_for_bytes(array.as_binary::<i32>()),
LargeBinary => runs_for_bytes(array.as_binary::<i64>()),
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]