Re: [PR] feat: optimize lower and upper functions [arrow-datafusion]

via GitHub Wed, 10 Apr 2024 02:35:04 -0700


Dandandan commented on code in PR #9971:
URL: https://github.com/apache/arrow-datafusion/pull/9971#discussion_r1559142821



##########
datafusion/functions/src/string/common.rs:
##########
@@ -97,80 +101,145 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
     }
 }
 
-/// applies a unary expression to `args[0]` that is expected to be 
downcastable to
-/// a `GenericStringArray` and returns a `GenericStringArray` (which may have 
a different offset)
-/// # Errors
-/// This function errors when:
-/// * the number of arguments is not 1
-/// * the first argument is not castable to a `GenericStringArray`
-pub(crate) fn unary_string_function<'a, T, O, F, R>(
-    args: &[&'a dyn Array],
-    op: F,
-    name: &str,
-) -> Result<GenericStringArray<O>>
-where
-    R: AsRef<str>,
-    O: OffsetSizeTrait,
-    T: OffsetSizeTrait,
-    F: Fn(&'a str) -> R,
-{
-    if args.len() != 1 {
-        return exec_err!(
-            "{:?} args were supplied but {} takes exactly one argument",
-            args.len(),
-            name
-        );
-    }
-
-    let string_array = as_generic_string_array::<T>(args[0])?;
+pub(crate) fn to_lower(args: &[ColumnarValue], name: &str) -> 
Result<ColumnarValue> {
+    case_conversion(args, |string| string.to_lowercase(), name)
+}
 
-    // first map is the iterator, second is for the `Option<_>`
-    Ok(string_array.iter().map(|string| string.map(&op)).collect())
+pub(crate) fn to_upper(args: &[ColumnarValue], name: &str) -> 
Result<ColumnarValue> {
+    case_conversion(args, |string| string.to_uppercase(), name)
 }
 
-pub(crate) fn handle<'a, F, R>(
+fn case_conversion<'a, F>(
     args: &'a [ColumnarValue],
     op: F,
     name: &str,
 ) -> Result<ColumnarValue>
 where
-    R: AsRef<str>,
-    F: Fn(&'a str) -> R,
+    F: Fn(&'a str) -> String,
 {
     match &args[0] {
-        ColumnarValue::Array(a) => match a.data_type() {
-            DataType::Utf8 => {
-                Ok(ColumnarValue::Array(Arc::new(unary_string_function::<
-                    i32,
-                    i32,
-                    _,
-                    _,
-                >(
-                    &[a.as_ref()], op, name
-                )?)))
-            }
-            DataType::LargeUtf8 => {
-                Ok(ColumnarValue::Array(Arc::new(unary_string_function::<
-                    i64,
-                    i64,
-                    _,
-                    _,
-                >(
-                    &[a.as_ref()], op, name
-                )?)))
-            }
+        ColumnarValue::Array(array) => match array.data_type() {
+            DataType::Utf8 => 
Ok(ColumnarValue::Array(case_conversion_array::<i32, _>(
+                array, op,
+            )?)),
+            DataType::LargeUtf8 => 
Ok(ColumnarValue::Array(case_conversion_array::<
+                i64,
+                _,
+            >(array, op)?)),
             other => exec_err!("Unsupported data type {other:?} for function 
{name}"),
         },
         ColumnarValue::Scalar(scalar) => match scalar {
             ScalarValue::Utf8(a) => {
-                let result = a.as_ref().map(|x| (op)(x).as_ref().to_string());
+                let result = a.as_ref().map(|x| op(x));
                 Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result)))
             }
             ScalarValue::LargeUtf8(a) => {
-                let result = a.as_ref().map(|x| (op)(x).as_ref().to_string());
+                let result = a.as_ref().map(|x| op(x));
                 Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result)))
             }
             other => exec_err!("Unsupported data type {other:?} for function 
{name}"),
         },
     }
 }
+
+fn case_conversion_array<'a, O, F>(array: &'a ArrayRef, op: F) -> 
Result<ArrayRef>
+where
+    O: OffsetSizeTrait,
+    F: Fn(&'a str) -> String,
+{
+    let string_array = as_generic_string_array::<O>(array)?;
+    let item_len = string_array.len();
+
+    // Find the first nonascii string at the beginning.
+    let find_the_first_nonascii = || {

Review Comment:
   AFAIK it is quite a bit faster to do the ASCII check once on the entire 
string/byte array (including nulls) than to check each string individually.
   This should simplify the logic as well, e.g. instead of searching for the 
index of the first non-ASCII string, only apply the optimization when the 
entire array is ASCII.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat: optimize lower and upper functions [arrow-datafusion]

Reply via email to