HaoYang670 commented on a change in pull request #1512:
URL: https://github.com/apache/arrow-rs/pull/1512#discussion_r839625876



##########
File path: arrow/src/compute/kernels/substring.rs
##########
@@ -24,56 +24,74 @@ use crate::{
     error::{ArrowError, Result},
 };
 
-#[allow(clippy::unnecessary_wraps)]
 fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
     array: &GenericStringArray<OffsetSize>,
     start: OffsetSize,
     length: &Option<OffsetSize>,
 ) -> Result<ArrayRef> {
-    // compute current offsets
-    let offsets = array.data_ref().clone().buffers()[0].clone();
-    let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() };
-
-    // compute null bitmap (copy)
+    let offsets = array.value_offsets();
     let null_bit_buffer = array.data_ref().null_buffer().cloned();
-
-    // compute values
-    let values = &array.data_ref().buffers()[1];
+    let values = array.value_data();
     let data = values.as_slice();
+    let zero = OffsetSize::zero();
 
-    let mut new_values = MutableBuffer::new(0); // we have no way to estimate 
how much this will be.
-    let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
-
-    let mut length_so_far = OffsetSize::zero();
-    new_offsets.push(length_so_far);
-    (0..array.len()).for_each(|i| {
-        // the length of this entry
-        let length_i: OffsetSize = offsets[i + 1] - offsets[i];
-        // compute where we should start slicing this entry
-        let start = offsets[i]
-            + if start >= OffsetSize::zero() {
-                start
-            } else {
-                length_i + start
-            };
-
-        let start = start.max(offsets[i]).min(offsets[i + 1]);
-        // compute the length of the slice
-        let length: OffsetSize = length
-            .unwrap_or(length_i)
-            // .max(0) is not needed as it is guaranteed
-            .min(offsets[i + 1] - start); // so we do not go beyond this entry
-
-        length_so_far += length;
+    // calculate the start offset for each substring
+    // if `start` >= 0
+    // then, count from the start of each string
+    // else, count from the end of each string
+    let new_starts: Vec<OffsetSize> = if start >= zero {
+        offsets
+            .windows(2)
+            .map(|pair| (pair[0] + start).min(pair[1]))
+            .collect()
+    } else {
+        offsets
+            .windows(2)
+            .map(|pair| (pair[1] + start).max(pair[0]))
+            .collect()

Review comment:
       I tried to remove `.collect`, actually. But you know, closure values in 
`if-else` branch can get `no two closures, even if identical, have the same 
type` error. And I did not find a good solution except adding `.collect`
   https://github.com/rust-lang/rust/issues/87961




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to