This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new fdb2d5761c Improve the performance of ltrim/rtrim/btrim (#10006)
fdb2d5761c is described below

commit fdb2d5761c64273ac7326b4a86b052b9bb9c08c7
Author: JasonLi <[email protected]>
AuthorDate: Thu Apr 11 00:56:09 2024 +0800

    Improve the performance of ltrim/rtrim/btrim (#10006)
    
    * optimize trim function
    
    * fix: the second arg is NULL
---
 datafusion/functions/Cargo.toml           |  5 ++++
 datafusion/functions/benches/ltrim.rs     | 50 +++++++++++++++++++++++++++++++
 datafusion/functions/src/string/btrim.rs  | 11 +++++--
 datafusion/functions/src/string/common.rs | 17 ++++++++++-
 datafusion/functions/src/string/ltrim.rs  | 11 +++++--
 datafusion/functions/src/string/rtrim.rs  | 11 +++++--
 6 files changed, 98 insertions(+), 7 deletions(-)

diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index a6847f3327..66f8b3010f 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -113,3 +113,8 @@ required-features = ["datetime_expressions"]
 harness = false
 name = "substr_index"
 required-features = ["unicode_expressions"]
+
+[[bench]]
+harness = false
+name = "ltrim"
+required-features = ["string_expressions"]
diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs
new file mode 100644
index 0000000000..01acb9de33
--- /dev/null
+++ b/datafusion/functions/benches/ltrim.rs
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::array::{ArrayRef, StringArray};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_common::ScalarValue;
+use datafusion_expr::ColumnarValue;
+use datafusion_functions::string;
+use std::sync::Arc;
+
+fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> {
+    let iter =
+        std::iter::repeat(format!("{}datafusion{}", characters, characters)).take(size);
+    let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef;
+    vec![
+        ColumnarValue::Array(array),
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(characters.to_string()))),
+    ]
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let ltrim = string::ltrim();
+    for char in ["\"", "Header:"] {
+        for size in [1024, 4096, 8192] {
+            let args = create_args(size, char);
+            c.bench_function(&format!("ltrim {}: {}", char, size), |b| {
+                b.iter(|| black_box(ltrim.invoke(&args)))
+            });
+        }
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs
index b0a85eab6d..971f7bbd4d 100644
--- a/datafusion/functions/src/string/btrim.rs
+++ b/datafusion/functions/src/string/btrim.rs
@@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
 use datafusion_expr::TypeSignature::*;
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
+use datafusion_physical_expr::functions::Hint;
 
 use crate::string::common::*;
 use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -72,8 +73,14 @@ impl ScalarUDFImpl for BTrimFunc {
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
         match args[0].data_type() {
-            DataType::Utf8 => make_scalar_function(btrim::<i32>, vec![])(args),
-            DataType::LargeUtf8 => make_scalar_function(btrim::<i64>, vec![])(args),
+            DataType::Utf8 => make_scalar_function(
+                btrim::<i32>,
+                vec![Hint::Pad, Hint::AcceptsSingular],
+            )(args),
+            DataType::LargeUtf8 => make_scalar_function(
+                btrim::<i64>,
+                vec![Hint::Pad, Hint::AcceptsSingular],
+            )(args),
             other => exec_err!("Unsupported data type {other:?} for function btrim"),
         }
     }
diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs
index 276aad121d..2b554db397 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -18,7 +18,9 @@
 use std::fmt::{Display, Formatter};
 use std::sync::Arc;
 
-use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::array::{
+    new_null_array, Array, ArrayRef, GenericStringArray, OffsetSizeTrait,
+};
 use arrow::datatypes::DataType;
 
 use datafusion_common::cast::as_generic_string_array;
@@ -78,6 +80,19 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
         2 => {
             let characters_array = as_generic_string_array::<T>(&args[1])?;
 
+            if characters_array.len() == 1 {
+                if characters_array.is_null(0) {
+                    return Ok(new_null_array(args[0].data_type(), args[0].len()));
+                }
+
+                let characters = characters_array.value(0);
+                let result = string_array
+                    .iter()
+                    .map(|item| item.map(|string| func(string, characters)))
+                    .collect::<GenericStringArray<T>>();
+                return Ok(Arc::new(result) as ArrayRef);
+            }
+
             let result = string_array
                 .iter()
                 .zip(characters_array.iter())
diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs
index ad86259d0d..1a6a9d497f 100644
--- a/datafusion/functions/src/string/ltrim.rs
+++ b/datafusion/functions/src/string/ltrim.rs
@@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
 use datafusion_expr::TypeSignature::*;
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
+use datafusion_physical_expr::functions::Hint;
 
 use crate::string::common::*;
 use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -70,8 +71,14 @@ impl ScalarUDFImpl for LtrimFunc {
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
         match args[0].data_type() {
-            DataType::Utf8 => make_scalar_function(ltrim::<i32>, vec![])(args),
-            DataType::LargeUtf8 => make_scalar_function(ltrim::<i64>, vec![])(args),
+            DataType::Utf8 => make_scalar_function(
+                ltrim::<i32>,
+                vec![Hint::Pad, Hint::AcceptsSingular],
+            )(args),
+            DataType::LargeUtf8 => make_scalar_function(
+                ltrim::<i64>,
+                vec![Hint::Pad, Hint::AcceptsSingular],
+            )(args),
             other => exec_err!("Unsupported data type {other:?} for function ltrim"),
         }
     }
diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs
index 607e647b26..e6e93e38c9 100644
--- a/datafusion/functions/src/string/rtrim.rs
+++ b/datafusion/functions/src/string/rtrim.rs
@@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
 use datafusion_expr::TypeSignature::*;
 use datafusion_expr::{ColumnarValue, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};
+use datafusion_physical_expr::functions::Hint;
 
 use crate::string::common::*;
 use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -70,8 +71,14 @@ impl ScalarUDFImpl for RtrimFunc {
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
         match args[0].data_type() {
-            DataType::Utf8 => make_scalar_function(rtrim::<i32>, vec![])(args),
-            DataType::LargeUtf8 => make_scalar_function(rtrim::<i64>, vec![])(args),
+            DataType::Utf8 => make_scalar_function(
+                rtrim::<i32>,
+                vec![Hint::Pad, Hint::AcceptsSingular],
+            )(args),
+            DataType::LargeUtf8 => make_scalar_function(
+                rtrim::<i64>,
+                vec![Hint::Pad, Hint::AcceptsSingular],
+            )(args),
             other => exec_err!("Unsupported data type {other:?} for function rtrim"),
         }
     }

Reply via email to