HaoYang670 commented on a change in pull request #1512:
URL: https://github.com/apache/arrow-rs/pull/1512#discussion_r840139836



##########
File path: arrow/src/compute/kernels/substring.rs
##########
@@ -24,56 +24,74 @@ use crate::{
     error::{ArrowError, Result},
 };
 
-#[allow(clippy::unnecessary_wraps)]
 fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
     array: &GenericStringArray<OffsetSize>,
     start: OffsetSize,
     length: &Option<OffsetSize>,
 ) -> Result<ArrayRef> {
-    // compute current offsets
-    let offsets = array.data_ref().clone().buffers()[0].clone();
-    let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() };
-
-    // compute null bitmap (copy)
+    let offsets = array.value_offsets();
     let null_bit_buffer = array.data_ref().null_buffer().cloned();
-
-    // compute values
-    let values = &array.data_ref().buffers()[1];
+    let values = array.value_data();
     let data = values.as_slice();
+    let zero = OffsetSize::zero();
 
-    let mut new_values = MutableBuffer::new(0); // we have no way to estimate 
how much this will be.
-    let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
-
-    let mut length_so_far = OffsetSize::zero();
-    new_offsets.push(length_so_far);
-    (0..array.len()).for_each(|i| {
-        // the length of this entry
-        let length_i: OffsetSize = offsets[i + 1] - offsets[i];
-        // compute where we should start slicing this entry
-        let start = offsets[i]
-            + if start >= OffsetSize::zero() {
-                start
-            } else {
-                length_i + start
-            };
-
-        let start = start.max(offsets[i]).min(offsets[i + 1]);
-        // compute the length of the slice
-        let length: OffsetSize = length
-            .unwrap_or(length_i)
-            // .max(0) is not needed as it is guaranteed
-            .min(offsets[i + 1] - start); // so we do not go beyond this entry
-
-        length_so_far += length;
+    // calculate the start offset for each substring
+    // if `start` >= 0
+    // then, count from the start of each string
+    // else, count from the end of each string
+    let new_starts: Vec<OffsetSize> = if start >= zero {
+        offsets
+            .windows(2)
+            .map(|pair| (pair[0] + start).min(pair[1]))
+            .collect()
+    } else {
+        offsets
+            .windows(2)
+            .map(|pair| (pair[1] + start).max(pair[0]))
+            .collect()

Review comment:
       > Might be better to do `new_starts`/`new_length` in one iteration over 
`offsets` and use `unzip` to materialize into two Vecs?
   
   I guess it will not make the code simple. Because there are 4 branches that 
has to be considered:
   ```rust
   let (new_starts, new_length) = match (start>=0, length) {
       (true, Some(length)) => ...,
       (true, None) => ...,
       (false, Some(length)) => ,
       (false, None) =>,
   };
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to