alamb commented on code in PR #12395:
URL: https://github.com/apache/datafusion/pull/12395#discussion_r1769179503


##########
datafusion/functions/src/string/common.rs:
##########
@@ -21,17 +21,39 @@ use std::fmt::{Display, Formatter};
 use std::sync::Arc;
 
 use arrow::array::{
-    new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, 
ArrayRef,
-    GenericStringArray, GenericStringBuilder, LargeStringArray, 
OffsetSizeTrait,
-    StringArray, StringBuilder, StringViewArray, StringViewBuilder,
+    make_view, new_null_array, Array, ArrayAccessor, ArrayDataBuilder, 
ArrayIter,
+    ArrayRef, ByteView, GenericStringArray, GenericStringBuilder, 
LargeStringArray,
+    OffsetSizeTrait, StringArray, StringBuilder, StringViewArray, 
StringViewBuilder,
 };
 use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};
 use arrow::datatypes::DataType;
+use arrow_buffer::{NullBufferBuilder, ScalarBuffer};
 use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
 use datafusion_common::Result;
 use datafusion_common::{exec_err, ScalarValue};
 use datafusion_expr::ColumnarValue;
 
+/// Make a `u128` based on the given substr, start(offset to view.offset), and
+/// push into to the given buffers

Review Comment:
   ```suggestion
   /// Append a new view to the views buffer with the given substr
   ///
   /// raw must be a valid view
   /// substr must be a valid substring of raw
   /// start must be less than or equal to the length of the string data
   ```



##########
datafusion/functions/src/string/common.rs:
##########
@@ -21,17 +21,39 @@ use std::fmt::{Display, Formatter};
 use std::sync::Arc;
 
 use arrow::array::{
-    new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, 
ArrayRef,
-    GenericStringArray, GenericStringBuilder, LargeStringArray, 
OffsetSizeTrait,
-    StringArray, StringBuilder, StringViewArray, StringViewBuilder,
+    make_view, new_null_array, Array, ArrayAccessor, ArrayDataBuilder, 
ArrayIter,
+    ArrayRef, ByteView, GenericStringArray, GenericStringBuilder, 
LargeStringArray,
+    OffsetSizeTrait, StringArray, StringBuilder, StringViewArray, 
StringViewBuilder,
 };
 use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};
 use arrow::datatypes::DataType;
+use arrow_buffer::{NullBufferBuilder, ScalarBuffer};
 use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
 use datafusion_common::Result;
 use datafusion_common::{exec_err, ScalarValue};
 use datafusion_expr::ColumnarValue;
 
+/// Make a `u128` based on the given substr, start(offset to view.offset), and
+/// push into to the given buffers
+pub(crate) fn make_and_append_view(

Review Comment:
   It seems like what would be really useful is a `StringViewBuilder` that 
could be modified perhaps 🤔 
   
   I started to write a ticket in arrow-rs but I didn't know exactly what API 
to suggest. I think we would have to try it out



##########
datafusion/functions/src/string/common.rs:
##########
@@ -72,65 +94,126 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
     };
 
     if use_string_view {
-        string_view_trim::<T>(func, args)
+        string_view_trim(func, args)
     } else {
         string_trim::<T>(func, args)
     }
 }
 
 // removing 'a will cause compiler complaining lifetime of `func`
-fn string_view_trim<'a, T: OffsetSizeTrait>(
-    func: fn(&'a str, &'a str) -> &'a str,
+fn string_view_trim<'a>(
+    trim_func: fn(&'a str, &'a str) -> &'a str,
     args: &'a [ArrayRef],
 ) -> Result<ArrayRef> {
-    let string_array = as_string_view_array(&args[0])?;
+    let string_view_array = as_string_view_array(&args[0])?;
+    let mut views_buf = Vec::with_capacity(string_view_array.len());
+    let mut null_builder = NullBufferBuilder::new(string_view_array.len());
 
     match args.len() {
         1 => {
-            let result = string_array
-                .iter()
-                .map(|string| string.map(|string: &str| func(string, " ")))
-                .collect::<GenericStringArray<T>>();
-
-            Ok(Arc::new(result) as ArrayRef)
+            let array_iter = string_view_array.iter();
+            let views_iter = string_view_array.views().iter();
+            for (src_str_opt, raw_view) in array_iter.zip(views_iter) {
+                trim_and_append_str(
+                    src_str_opt,
+                    Some(" "),
+                    trim_func,
+                    &mut views_buf,
+                    &mut null_builder,
+                    raw_view,
+                );
+            }
         }
         2 => {
             let characters_array = as_string_view_array(&args[1])?;
 
             if characters_array.len() == 1 {
+                // Only one `trim characters` exist
                 if characters_array.is_null(0) {
                     return Ok(new_null_array(
                         // The schema is expecting utf8 as null
-                        &DataType::Utf8,
-                        string_array.len(),
+                        &DataType::Utf8View,
+                        string_view_array.len(),
                     ));
                 }
 
                 let characters = characters_array.value(0);
-                let result = string_array
-                    .iter()
-                    .map(|item| item.map(|string| func(string, characters)))
-                    .collect::<GenericStringArray<T>>();
-                return Ok(Arc::new(result) as ArrayRef);
+                let array_iter = string_view_array.iter();
+                let views_iter = string_view_array.views().iter();
+                for (src_str_opt, raw_view) in array_iter.zip(views_iter) {
+                    trim_and_append_str(
+                        src_str_opt,
+                        Some(characters),
+                        trim_func,
+                        &mut views_buf,
+                        &mut null_builder,
+                        raw_view,
+                    );
+                }
+            } else {
+                // A specific `trim characters` for a row in the string view 
array
+                let characters_iter = characters_array.iter();
+                let array_iter = string_view_array.iter();
+                let views_iter = string_view_array.views().iter();
+                for ((src_str_opt, raw_view), characters_opt) in
+                    array_iter.zip(views_iter).zip(characters_iter)
+                {
+                    trim_and_append_str(
+                        src_str_opt,
+                        characters_opt,
+                        trim_func,
+                        &mut views_buf,
+                        &mut null_builder,
+                        raw_view,
+                    );
+                }
             }
-
-            let result = string_array
-                .iter()
-                .zip(characters_array.iter())
-                .map(|(string, characters)| match (string, characters) {
-                    (Some(string), Some(characters)) => Some(func(string, 
characters)),
-                    _ => None,
-                })
-                .collect::<GenericStringArray<T>>();
-
-            Ok(Arc::new(result) as ArrayRef)
         }
         other => {
-            exec_err!(
+            return exec_err!(
             "Function TRIM was called with {other} arguments. It requires at 
least 1 and at most 2."
-            )
+            );
         }
     }
+
+    let views_buf = ScalarBuffer::from(views_buf);
+    let nulls_buf = null_builder.finish();
+
+    // Safety:
+    // (1) The blocks of the given views are all provided
+    // (2) Each of the range `view.offset+start..end` of view in views_buf is 
within
+    // the bounds of each of the blocks
+    unsafe {
+        let array = StringViewArray::new_unchecked(
+            views_buf,
+            string_view_array.data_buffers().to_vec(),
+            nulls_buf,
+        );
+        Ok(Arc::new(array) as ArrayRef)
+    }
+}
+
+fn trim_and_append_str<'a>(
+    src_str_opt: Option<&'a str>,
+    trim_characters_opt: Option<&'a str>,
+    trim_func: fn(&'a str, &'a str) -> &'a str,
+    views_buf: &mut Vec<u128>,
+    null_builder: &mut NullBufferBuilder,
+    raw: &u128,
+) {
+    if let (Some(src_str), Some(characters)) = (src_str_opt, 
trim_characters_opt) {
+        let trim_str = trim_func(src_str, characters);
+
+        // Safety:
+        // `trim_str` is computed from `str::trim_xxx_matches`,
+        // and its addr is ensured to be >= `origin_str`'s
+        let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) 
as u32 };

Review Comment:
   If you need the length of this in bytes, I don't think you need unsafe here
   
   How about this:
   ```suggestion
           let start = (src_str.as_bytes().iter().len() - 
trim_str.as_bytes().len())  as u32;
   ```



##########
datafusion/functions/src/string/common.rs:
##########
@@ -72,65 +94,126 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
     };
 
     if use_string_view {
-        string_view_trim::<T>(func, args)
+        string_view_trim(func, args)
     } else {
         string_trim::<T>(func, args)
     }
 }
 
 // removing 'a will cause compiler complaining lifetime of `func`
-fn string_view_trim<'a, T: OffsetSizeTrait>(
-    func: fn(&'a str, &'a str) -> &'a str,
+fn string_view_trim<'a>(
+    trim_func: fn(&'a str, &'a str) -> &'a str,
     args: &'a [ArrayRef],
 ) -> Result<ArrayRef> {
-    let string_array = as_string_view_array(&args[0])?;
+    let string_view_array = as_string_view_array(&args[0])?;
+    let mut views_buf = Vec::with_capacity(string_view_array.len());
+    let mut null_builder = NullBufferBuilder::new(string_view_array.len());
 
     match args.len() {
         1 => {
-            let result = string_array
-                .iter()
-                .map(|string| string.map(|string: &str| func(string, " ")))
-                .collect::<GenericStringArray<T>>();
-
-            Ok(Arc::new(result) as ArrayRef)
+            let array_iter = string_view_array.iter();
+            let views_iter = string_view_array.views().iter();
+            for (src_str_opt, raw_view) in array_iter.zip(views_iter) {
+                trim_and_append_str(
+                    src_str_opt,
+                    Some(" "),
+                    trim_func,
+                    &mut views_buf,
+                    &mut null_builder,
+                    raw_view,
+                );
+            }
         }
         2 => {
             let characters_array = as_string_view_array(&args[1])?;
 
             if characters_array.len() == 1 {
+                // Only one `trim characters` exist
                 if characters_array.is_null(0) {
                     return Ok(new_null_array(
                         // The schema is expecting utf8 as null
-                        &DataType::Utf8,
-                        string_array.len(),
+                        &DataType::Utf8View,
+                        string_view_array.len(),
                     ));
                 }
 
                 let characters = characters_array.value(0);
-                let result = string_array
-                    .iter()
-                    .map(|item| item.map(|string| func(string, characters)))
-                    .collect::<GenericStringArray<T>>();
-                return Ok(Arc::new(result) as ArrayRef);
+                let array_iter = string_view_array.iter();
+                let views_iter = string_view_array.views().iter();
+                for (src_str_opt, raw_view) in array_iter.zip(views_iter) {
+                    trim_and_append_str(
+                        src_str_opt,
+                        Some(characters),
+                        trim_func,
+                        &mut views_buf,
+                        &mut null_builder,
+                        raw_view,
+                    );
+                }
+            } else {
+                // A specific `trim characters` for a row in the string view 
array
+                let characters_iter = characters_array.iter();
+                let array_iter = string_view_array.iter();
+                let views_iter = string_view_array.views().iter();
+                for ((src_str_opt, raw_view), characters_opt) in
+                    array_iter.zip(views_iter).zip(characters_iter)
+                {
+                    trim_and_append_str(
+                        src_str_opt,
+                        characters_opt,
+                        trim_func,
+                        &mut views_buf,
+                        &mut null_builder,
+                        raw_view,
+                    );
+                }
             }
-
-            let result = string_array
-                .iter()
-                .zip(characters_array.iter())
-                .map(|(string, characters)| match (string, characters) {
-                    (Some(string), Some(characters)) => Some(func(string, 
characters)),
-                    _ => None,
-                })
-                .collect::<GenericStringArray<T>>();
-
-            Ok(Arc::new(result) as ArrayRef)
         }
         other => {
-            exec_err!(
+            return exec_err!(
             "Function TRIM was called with {other} arguments. It requires at 
least 1 and at most 2."
-            )
+            );
         }
     }
+
+    let views_buf = ScalarBuffer::from(views_buf);
+    let nulls_buf = null_builder.finish();
+
+    // Safety:

Review Comment:
   Again I think using StringBuilder here might improve performance



##########
datafusion/functions/src/string/common.rs:
##########
@@ -72,65 +94,126 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
     };
 
     if use_string_view {
-        string_view_trim::<T>(func, args)
+        string_view_trim(func, args)
     } else {
         string_trim::<T>(func, args)
     }
 }
 
 // removing 'a will cause compiler complaining lifetime of `func`
-fn string_view_trim<'a, T: OffsetSizeTrait>(
-    func: fn(&'a str, &'a str) -> &'a str,
+fn string_view_trim<'a>(
+    trim_func: fn(&'a str, &'a str) -> &'a str,
     args: &'a [ArrayRef],
 ) -> Result<ArrayRef> {
-    let string_array = as_string_view_array(&args[0])?;
+    let string_view_array = as_string_view_array(&args[0])?;
+    let mut views_buf = Vec::with_capacity(string_view_array.len());
+    let mut null_builder = NullBufferBuilder::new(string_view_array.len());

Review Comment:
   I wonder if you could use `StringViewBuilder` here rather than building up 
the views buffer directly ?
   
   https://docs.rs/arrow/latest/arrow/array/type.StringViewBuilder.html



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to