Re: [PR] Improve StringView support for SUBSTR [datafusion]

via GitHub Fri, 23 Aug 2024 00:31:40 -0700


Kev1n8 commented on code in PR #12044:
URL: https://github.com/apache/datafusion/pull/12044#discussion_r1727098779



##########
datafusion/functions/src/unicode/substr.rs:
##########
@@ -89,29 +94,193 @@ impl ScalarUDFImpl for SubstrFunc {
     }
 }
 
+/// Extracts the substring of string starting at the start'th character, and 
extending for count characters if that is specified. (Same as substring(string 
from start for count).)
+/// substr('alphabet', 3) = 'phabet'
+/// substr('alphabet', 3, 2) = 'ph'
+/// The implementation uses UTF-8 code points as characters
 pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args[0].data_type() {
         DataType::Utf8 => {
             let string_array = args[0].as_string::<i32>();
-            calculate_substr::<_, i32>(string_array, &args[1..])
+            string_substr::<_, i32>(string_array, &args[1..])
         }
         DataType::LargeUtf8 => {
             let string_array = args[0].as_string::<i64>();
-            calculate_substr::<_, i64>(string_array, &args[1..])
+            string_substr::<_, i64>(string_array, &args[1..])
         }
         DataType::Utf8View => {
             let string_array = args[0].as_string_view();
-            calculate_substr::<_, i32>(string_array, &args[1..])
+            string_view_substr(string_array, &args[1..])
         }
-        other => exec_err!("Unsupported data type {other:?} for function 
substr"),
+        other => exec_err!(
+            "Unsupported data type {other:?} for function substr,\
+            expected Utf8View, Utf8 or LargeUtf8."
+        ),
     }
 }
 
-/// Extracts the substring of string starting at the start'th character, and 
extending for count characters if that is specified. (Same as substring(string 
from start for count).)
-/// substr('alphabet', 3) = 'phabet'
-/// substr('alphabet', 3, 2) = 'ph'
-/// The implementation uses UTF-8 code points as characters
-fn calculate_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> 
Result<ArrayRef>
+// Return the exact byte index for [start, end), set count to -1 to ignore 
count
+fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize, 
usize) {
+    let (mut st, mut ed) = (input.len(), input.len());
+    let mut start_counting = false;
+    let mut cnt = 0;
+    for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() {
+        if char_cnt == start {
+            st = byte_cnt;
+            if count != -1 {
+                start_counting = true;
+            } else {
+                break;
+            }
+        }
+        if start_counting {
+            if cnt == count {
+                ed = byte_cnt;
+                break;
+            }
+            cnt += 1;
+        }
+    }
+    (st, ed)
+}
+
+// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44
+// From<u128> for ByteView
+fn string_view_substr(
+    string_view_array: &StringViewArray,
+    args: &[ArrayRef],
+) -> Result<ArrayRef> {
+    let mut builder = StringViewBuilder::new();
+    // Copy all blocks from input
+    for block in string_view_array.data_buffers() {
+        builder.append_block(block.clone());
+    }
+
+    let start_array = as_int64_array(&args[0])?;
+
+    match args.len() {
+        1 => {
+            for (idx, (raw, start)) in string_view_array
+                .views()
+                .iter()
+                .zip(start_array.iter())
+                .enumerate()
+            {
+                if let Some(start) = start {
+                    let length = *raw as u32;
+                    let start = (start - 1).max(0);
+
+                    // Operate according to the length of bytes
+                    if length == 0 {
+                        builder.append_null();
+                    } else if length > 12 {
+                        let view = ByteView::from(*raw);
+
+                        // Safety:
+                        // 1. idx < string_array.views.size()
+                        // 2. builder is guaranteed to have corresponding 
blocks
+                        unsafe {
+                            let str = string_view_array.value_unchecked(idx);
+                            let (start, end) =
+                                get_true_start_count(str, start as usize, -1);
+                            builder.append_view_unchecked(
+                                view.buffer_index,
+                                view.offset + start as u32,
+                                // guarantee that end-offset >= 0 for end <= 
str.len()
+                                (end - start) as u32,
+                            );
+                        }
+                    } else {
+                        // Safety:
+                        // (1) original bytes are valid utf-8,
+                        // (2) we do not slice on utf-8 codepoint
+                        unsafe {
+                            let bytes =
+                                StringViewArray::inline_value(raw, length as 
usize);

Review Comment:
   Maybe we could add a function like `append_view_u128_unchecked(view: u128)` 
in arrow/src/builder/generic_bytes_view_builder.rs to simply add a view with a 
given `u128`. Then the whole process would be:
   1. Get the str of the view by `value_unchecked`, then get the [start, end)
   2. `sub_view =` ~~`if end-start>12 substr_large_view() else 
substr_small_view()`~~ `make_view()`
   3. call `append_view_u128_unchecked(sub_view)` on the builder



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Improve StringView support for SUBSTR [datafusion]

Reply via email to