XiangpengHao commented on code in PR #12044:
URL: https://github.com/apache/datafusion/pull/12044#discussion_r1720863104


##########
datafusion/functions/src/unicode/substr.rs:
##########
@@ -107,11 +112,170 @@ pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
     }
 }
 
-/// Extracts the substring of string starting at the start'th character, and 
extending for count characters if that is specified. (Same as substring(string 
from start for count).)
-/// substr('alphabet', 3) = 'phabet'
-/// substr('alphabet', 3, 2) = 'ph'
-/// The implementation uses UTF-8 code points as characters
-fn calculate_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> 
Result<ArrayRef>
+// Return the exact byte index for [start, end), set count to -1 to ignore 
count
+fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize, 
usize) {
+    let (mut st, mut ed) = (input.len(), input.len());
+    let mut start_counting = false;
+    let mut cnt = 0;
+    for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() {
+        if char_cnt == start {
+            st = byte_cnt;
+            if count != -1 {
+                start_counting = true;
+            } else {
+                break;
+            }
+        }
+        if start_counting {
+            if cnt == count {
+                ed = byte_cnt;
+                break;
+            }
+            cnt += 1;
+        }
+    }
+    (st, ed)
+}
+
+// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44
+// From<u128> for ByteView
+fn calculate_string_view(
+    string_array: &StringViewArray,
+    args: &[ArrayRef],
+) -> Result<ArrayRef> {
+    let mut builder = StringViewBuilder::new();
+    // Copy all blocks from input
+    for block in string_array.data_buffers() {
+        builder.append_block(block.clone());
+    }
+
+    let start_array = as_int64_array(&args[0])?;
+
+    match args.len() {
+        1 => {
+            for (idx, (raw, start)) in string_array
+                .views()
+                .iter()
+                .zip(start_array.iter())
+                .enumerate()
+            {
+                if let Some(start) = start {
+                    let length = *raw as u32;
+                    let start = (start - 1).max(0);
+
+                    // Operate according to the length of bytes
+                    if length == 0 {
+                        builder.append_null();
+                    } else if length > 12 {
+                        let buffer_index = (*raw >> 64) as u32;
+                        let offset = (*raw >> 96) as u32;
+                        let str = string_array.value(idx);
+                        let (start, end) = get_true_start_count(str, start as 
usize, -1);
+                        // Safety: builder is guaranteed to have corresponding 
blocks
+                        unsafe {
+                            builder.append_view_unchecked(
+                                buffer_index,
+                                offset + start as u32,
+                                // guarantee that end-offset >= 0 for end <= 
str.len()
+                                (end - start) as u32,
+                            );
+                        }
+                    } else {
+                        let bytes = ((*raw >> 32) & u128::MAX).to_le_bytes();
+                        let str = match std::str::from_utf8(&bytes[..length as 
usize]) {
+                            Ok(str) => {
+                                // Extract str[start, end) by char
+                                let (start, end) = get_true_start_count(
+                                    str,
+                                    start as usize,
+                                    length as i64,
+                                );
+                                &str[start..end]
+                            }
+                            _ => {
+                                return exec_err!(
+                                    "failed to convert inline bytes to &str."
+                                )
+                            }
+                        };
+                        builder.append_value(str);
+                    }
+                } else {
+                    builder.append_null();
+                }
+            }
+        }
+        2 => {
+            let count_array = as_int64_array(&args[1])?;
+            for (idx, ((raw, start), count)) in string_array
+                .views()
+                .iter()
+                .zip(start_array.iter())
+                .zip(count_array.iter())
+                .enumerate()
+            {
+                if let (Some(start), Some(count)) = (start, count) {
+                    let length = *raw as u32;
+                    let start = (start - 1).max(0) as usize;

Review Comment:
   ```suggestion
                       let start = (start as usize).saturating_sub(1);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to