Re: [PR] Improve StringView support for SUBSTR [datafusion]

via GitHub Wed, 21 Aug 2024 12:46:00 -0700


alamb commented on code in PR #12044:
URL: https://github.com/apache/datafusion/pull/12044#discussion_r1725668538



##########
datafusion/functions/src/unicode/substr.rs:
##########
@@ -89,29 +94,193 @@ impl ScalarUDFImpl for SubstrFunc {
     }
 }
 
+/// Extracts the substring of string starting at the start'th character, and 
extending for count characters if that is specified. (Same as substring(string 
from start for count).)
+/// substr('alphabet', 3) = 'phabet'
+/// substr('alphabet', 3, 2) = 'ph'
+/// The implementation uses UTF-8 code points as characters
 pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args[0].data_type() {
         DataType::Utf8 => {
             let string_array = args[0].as_string::<i32>();
-            calculate_substr::<_, i32>(string_array, &args[1..])
+            string_substr::<_, i32>(string_array, &args[1..])
         }
         DataType::LargeUtf8 => {
             let string_array = args[0].as_string::<i64>();
-            calculate_substr::<_, i64>(string_array, &args[1..])
+            string_substr::<_, i64>(string_array, &args[1..])
         }
         DataType::Utf8View => {
             let string_array = args[0].as_string_view();
-            calculate_substr::<_, i32>(string_array, &args[1..])
+            string_view_substr(string_array, &args[1..])
         }
-        other => exec_err!("Unsupported data type {other:?} for function 
substr"),
+        other => exec_err!(
+            "Unsupported data type {other:?} for function substr,\
+            expected Utf8View, Utf8 or LargeUtf8."
+        ),
     }
 }
 
-/// Extracts the substring of string starting at the start'th character, and 
extending for count characters if that is specified. (Same as substring(string 
from start for count).)
-/// substr('alphabet', 3) = 'phabet'
-/// substr('alphabet', 3, 2) = 'ph'
-/// The implementation uses UTF-8 code points as characters
-fn calculate_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> 
Result<ArrayRef>
+// Return the exact byte index for [start, end), set count to -1 to ignore 
count
+fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize, 
usize) {
+    let (mut st, mut ed) = (input.len(), input.len());
+    let mut start_counting = false;
+    let mut cnt = 0;
+    for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() {
+        if char_cnt == start {
+            st = byte_cnt;
+            if count != -1 {
+                start_counting = true;
+            } else {
+                break;
+            }
+        }
+        if start_counting {
+            if cnt == count {
+                ed = byte_cnt;
+                break;
+            }
+            cnt += 1;
+        }
+    }
+    (st, ed)
+}
+
+// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44
+// From<u128> for ByteView
+fn string_view_substr(
+    string_view_array: &StringViewArray,
+    args: &[ArrayRef],
+) -> Result<ArrayRef> {
+    let mut builder = StringViewBuilder::new();
+    // Copy all blocks from input
+    for block in string_view_array.data_buffers() {
+        builder.append_block(block.clone());
+    }
+
+    let start_array = as_int64_array(&args[0])?;
+
+    match args.len() {
+        1 => {
+            for (idx, (raw, start)) in string_view_array
+                .views()
+                .iter()
+                .zip(start_array.iter())
+                .enumerate()
+            {
+                if let Some(start) = start {
+                    let length = *raw as u32;
+                    let start = (start - 1).max(0);
+
+                    // Operate according to the length of bytes
+                    if length == 0 {
+                        builder.append_null();
+                    } else if length > 12 {
+                        let view = ByteView::from(*raw);
+
+                        // Safety:
+                        // 1. idx < string_array.views.size()
+                        // 2. builder is guaranteed to have corresponding 
blocks
+                        unsafe {
+                            let str = string_view_array.value_unchecked(idx);
+                            let (start, end) =
+                                get_true_start_count(str, start as usize, -1);
+                            builder.append_view_unchecked(
+                                view.buffer_index,
+                                view.offset + start as u32,
+                                // guarantee that end-offset >= 0 for end <= 
str.len()
+                                (end - start) as u32,
+                            );
+                        }
+                    } else {
+                        // Safety:
+                        // (1) original bytes are valid utf-8,
+                        // (2) we do not slice on utf-8 codepoint
+                        unsafe {
+                            let bytes =
+                                StringViewArray::inline_value(raw, length as 
usize);

Review Comment:
   I would expect that we never had to call `append_value` -- as I think the 
`substr` calculation can be done entirely in terms of the views (and never 
modify the values in the buffer). Reading this code carefully I do think is the 
case but it is quite implicit (it knows that calling append_value on a string 
that is less than 12 bytes) will not push values values into the buffer
   
   I wonder if we could refactor some of this code into functions with names
   
   Perhaps something like
   
   ```rust
   /// Modify a `view` with length <= 12 bytes so it reflects the substring 
[start..end]
   fn substr_small_view(len: usize, view: u128, start: usize, end: size) -> 
u128 { ... }
   
   /// Modify a `view` with length > 12 bytes so it reflects the substring 
[start..end]
   fn substr_large_view(len: usize, view: u128, start: usize, end: size) -> 
u128 { ... }
   ```
   
   Then we could simply updated the views directly
   
   However, the `StringViewBuilder` doesn't seem to allow for that at the 
moment....



##########
datafusion/functions/src/unicode/substr.rs:
##########
@@ -89,29 +94,193 @@ impl ScalarUDFImpl for SubstrFunc {
     }
 }
 
+/// Extracts the substring of string starting at the start'th character, and 
extending for count characters if that is specified. (Same as substring(string 
from start for count).)
+/// substr('alphabet', 3) = 'phabet'
+/// substr('alphabet', 3, 2) = 'ph'
+/// The implementation uses UTF-8 code points as characters
 pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args[0].data_type() {
         DataType::Utf8 => {
             let string_array = args[0].as_string::<i32>();
-            calculate_substr::<_, i32>(string_array, &args[1..])
+            string_substr::<_, i32>(string_array, &args[1..])
         }
         DataType::LargeUtf8 => {
             let string_array = args[0].as_string::<i64>();
-            calculate_substr::<_, i64>(string_array, &args[1..])
+            string_substr::<_, i64>(string_array, &args[1..])
         }
         DataType::Utf8View => {
             let string_array = args[0].as_string_view();
-            calculate_substr::<_, i32>(string_array, &args[1..])
+            string_view_substr(string_array, &args[1..])
         }
-        other => exec_err!("Unsupported data type {other:?} for function 
substr"),
+        other => exec_err!(
+            "Unsupported data type {other:?} for function substr,\
+            expected Utf8View, Utf8 or LargeUtf8."
+        ),
     }
 }
 
-/// Extracts the substring of string starting at the start'th character, and 
extending for count characters if that is specified. (Same as substring(string 
from start for count).)
-/// substr('alphabet', 3) = 'phabet'
-/// substr('alphabet', 3, 2) = 'ph'
-/// The implementation uses UTF-8 code points as characters
-fn calculate_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> 
Result<ArrayRef>
+// Return the exact byte index for [start, end), set count to -1 to ignore 
count
+fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize, 
usize) {
+    let (mut st, mut ed) = (input.len(), input.len());
+    let mut start_counting = false;
+    let mut cnt = 0;
+    for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() {
+        if char_cnt == start {
+            st = byte_cnt;
+            if count != -1 {
+                start_counting = true;
+            } else {
+                break;
+            }
+        }
+        if start_counting {
+            if cnt == count {
+                ed = byte_cnt;
+                break;
+            }
+            cnt += 1;
+        }
+    }
+    (st, ed)
+}
+
+// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44
+// From<u128> for ByteView
+fn string_view_substr(
+    string_view_array: &StringViewArray,
+    args: &[ArrayRef],
+) -> Result<ArrayRef> {
+    let mut builder = StringViewBuilder::new();
+    // Copy all blocks from input
+    for block in string_view_array.data_buffers() {
+        builder.append_block(block.clone());
+    }
+
+    let start_array = as_int64_array(&args[0])?;
+
+    match args.len() {
+        1 => {
+            for (idx, (raw, start)) in string_view_array
+                .views()
+                .iter()
+                .zip(start_array.iter())
+                .enumerate()
+            {
+                if let Some(start) = start {
+                    let length = *raw as u32;
+                    let start = (start - 1).max(0);
+
+                    // Operate according to the length of bytes
+                    if length == 0 {
+                        builder.append_null();
+                    } else if length > 12 {
+                        let view = ByteView::from(*raw);
+
+                        // Safety:
+                        // 1. idx < string_array.views.size()
+                        // 2. builder is guaranteed to have corresponding 
blocks
+                        unsafe {

Review Comment:
   There seems like there is some non trivial duplication here and the clauses 
below



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Improve StringView support for SUBSTR [datafusion]

Reply via email to