HaoYang670 commented on a change in pull request #1512: URL: https://github.com/apache/arrow-rs/pull/1512#discussion_r840187012
########## File path: arrow/src/compute/kernels/substring.rs ########## @@ -24,56 +24,74 @@ use crate::{ error::{ArrowError, Result}, }; -#[allow(clippy::unnecessary_wraps)] fn generic_substring<OffsetSize: StringOffsetSizeTrait>( array: &GenericStringArray<OffsetSize>, start: OffsetSize, length: &Option<OffsetSize>, ) -> Result<ArrayRef> { - // compute current offsets - let offsets = array.data_ref().clone().buffers()[0].clone(); - let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() }; - - // compute null bitmap (copy) + let offsets = array.value_offsets(); let null_bit_buffer = array.data_ref().null_buffer().cloned(); - - // compute values - let values = &array.data_ref().buffers()[1]; + let values = array.value_data(); let data = values.as_slice(); + let zero = OffsetSize::zero(); - let mut new_values = MutableBuffer::new(0); // we have no way to estimate how much this will be. - let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1); - - let mut length_so_far = OffsetSize::zero(); - new_offsets.push(length_so_far); - (0..array.len()).for_each(|i| { - // the length of this entry - let length_i: OffsetSize = offsets[i + 1] - offsets[i]; - // compute where we should start slicing this entry - let start = offsets[i] - + if start >= OffsetSize::zero() { - start - } else { - length_i + start - }; - - let start = start.max(offsets[i]).min(offsets[i + 1]); - // compute the length of the slice - let length: OffsetSize = length - .unwrap_or(length_i) - // .max(0) is not needed as it is guaranteed - .min(offsets[i + 1] - start); // so we do not go beyond this entry - - length_so_far += length; + // calculate the start offset for each substring + // if `start` >= 0 + // then, count from the start of each string + // else, count from the end of each string + let new_starts: Vec<OffsetSize> = if start >= zero { + offsets + .windows(2) + .map(|pair| (pair[0] + start).min(pair[1])) + .collect() + } else { + offsets + .windows(2) + .map(|pair| (pair[1] + start).max(pair[0])) + .collect() Review comment: > Yes, that is not possible in Rust (I think only by Boxing or manually inlining / macros). > > Calculating the length in two places with something like `let length = *end - *start` within both the `new_offsets` `new_values` doesn´t seem too bad in terms of duplication? One way is to let the closure not capture the environment, and the compiler will downcast closure to function. This works for calculating `new_starts`, but doesn't work for `new_length` where we must capture `Some(length) = length` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org