Rachelint commented on code in PR #12395:
URL: https://github.com/apache/datafusion/pull/12395#discussion_r1769415355


##########
datafusion/functions/src/string/common.rs:
##########
@@ -72,65 +94,126 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
     };
 
     if use_string_view {
-        string_view_trim::<T>(func, args)
+        string_view_trim(func, args)
     } else {
         string_trim::<T>(func, args)
     }
 }
 
 // removing 'a will cause compiler complaining lifetime of `func`
-fn string_view_trim<'a, T: OffsetSizeTrait>(
-    func: fn(&'a str, &'a str) -> &'a str,
+fn string_view_trim<'a>(
+    trim_func: fn(&'a str, &'a str) -> &'a str,
     args: &'a [ArrayRef],
 ) -> Result<ArrayRef> {
-    let string_array = as_string_view_array(&args[0])?;
+    let string_view_array = as_string_view_array(&args[0])?;
+    let mut views_buf = Vec::with_capacity(string_view_array.len());
+    let mut null_builder = NullBufferBuilder::new(string_view_array.len());
 
     match args.len() {
         1 => {
-            let result = string_array
-                .iter()
-                .map(|string| string.map(|string: &str| func(string, " ")))
-                .collect::<GenericStringArray<T>>();
-
-            Ok(Arc::new(result) as ArrayRef)
+            let array_iter = string_view_array.iter();
+            let views_iter = string_view_array.views().iter();
+            for (src_str_opt, raw_view) in array_iter.zip(views_iter) {
+                trim_and_append_str(
+                    src_str_opt,
+                    Some(" "),
+                    trim_func,
+                    &mut views_buf,
+                    &mut null_builder,
+                    raw_view,
+                );
+            }
         }
         2 => {
             let characters_array = as_string_view_array(&args[1])?;
 
             if characters_array.len() == 1 {
+                // Only one `trim characters` exist
                 if characters_array.is_null(0) {
                     return Ok(new_null_array(
                         // The schema is expecting utf8 as null
-                        &DataType::Utf8,
-                        string_array.len(),
+                        &DataType::Utf8View,
+                        string_view_array.len(),
                     ));
                 }
 
                 let characters = characters_array.value(0);
-                let result = string_array
-                    .iter()
-                    .map(|item| item.map(|string| func(string, characters)))
-                    .collect::<GenericStringArray<T>>();
-                return Ok(Arc::new(result) as ArrayRef);
+                let array_iter = string_view_array.iter();
+                let views_iter = string_view_array.views().iter();
+                for (src_str_opt, raw_view) in array_iter.zip(views_iter) {
+                    trim_and_append_str(
+                        src_str_opt,
+                        Some(characters),
+                        trim_func,
+                        &mut views_buf,
+                        &mut null_builder,
+                        raw_view,
+                    );
+                }
+            } else {
+                // A specific `trim characters` for a row in the string view 
array
+                let characters_iter = characters_array.iter();
+                let array_iter = string_view_array.iter();
+                let views_iter = string_view_array.views().iter();
+                for ((src_str_opt, raw_view), characters_opt) in
+                    array_iter.zip(views_iter).zip(characters_iter)
+                {
+                    trim_and_append_str(
+                        src_str_opt,
+                        characters_opt,
+                        trim_func,
+                        &mut views_buf,
+                        &mut null_builder,
+                        raw_view,
+                    );
+                }
             }
-
-            let result = string_array
-                .iter()
-                .zip(characters_array.iter())
-                .map(|(string, characters)| match (string, characters) {
-                    (Some(string), Some(characters)) => Some(func(string, 
characters)),
-                    _ => None,
-                })
-                .collect::<GenericStringArray<T>>();
-
-            Ok(Arc::new(result) as ArrayRef)
         }
         other => {
-            exec_err!(
+            return exec_err!(
             "Function TRIM was called with {other} arguments. It requires at 
least 1 and at most 2."
-            )
+            );
         }
     }
+
+    let views_buf = ScalarBuffer::from(views_buf);
+    let nulls_buf = null_builder.finish();
+
+    // Safety:
+    // (1) The blocks of the given views are all provided
+    // (2) Each of the range `view.offset+start..end` of view in views_buf is 
within
+    // the bounds of each of the blocks
+    unsafe {
+        let array = StringViewArray::new_unchecked(
+            views_buf,
+            string_view_array.data_buffers().to_vec(),
+            nulls_buf,
+        );
+        Ok(Arc::new(array) as ArrayRef)
+    }
+}
+
+fn trim_and_append_str<'a>(
+    src_str_opt: Option<&'a str>,
+    trim_characters_opt: Option<&'a str>,
+    trim_func: fn(&'a str, &'a str) -> &'a str,
+    views_buf: &mut Vec<u128>,
+    null_builder: &mut NullBufferBuilder,
+    raw: &u128,
+) {
+    if let (Some(src_str), Some(characters)) = (src_str_opt, 
trim_characters_opt) {
+        let trim_str = trim_func(src_str, characters);
+
+        // Safety:
+        // `trim_str` is computed from `str::trim_xxx_matches`,
+        // and its addr is ensured to be >= `origin_str`'s
+        let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) 
as u32 };

Review Comment:
   > If you need the length of this in bytes, I don't think you need unsafe here
   > 
   > How about this:
   
   I want to get the `trim_str`'s "start offest" from `src_str` actually. 
   For example,
   - Assume that `src_str`: "  abc  ", after btrim, `trim_str`: "abc".
   - The start of `src_str` is ' '(offset 0), the new start of `trim_str` is 
'a' (offset 2), and I expected the relative offset 2 (2 - 0).
   - And `len(src_str) - len(trim_str)` is 4.
   
   Maybe wen can only make it using unsafe temperarily 
https://github.com/apache/datafusion/issues/12387
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to