This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new fdb2d5761c Improve the performance of ltrim/rtrim/btrim (#10006)
fdb2d5761c is described below
commit fdb2d5761c64273ac7326b4a86b052b9bb9c08c7
Author: JasonLi <[email protected]>
AuthorDate: Thu Apr 11 00:56:09 2024 +0800
Improve the performance of ltrim/rtrim/btrim (#10006)
* optimize trim function
* fix: the second arg is NULL
---
datafusion/functions/Cargo.toml | 5 ++++
datafusion/functions/benches/ltrim.rs | 50 +++++++++++++++++++++++++++++++
datafusion/functions/src/string/btrim.rs | 11 +++++--
datafusion/functions/src/string/common.rs | 17 ++++++++++-
datafusion/functions/src/string/ltrim.rs | 11 +++++--
datafusion/functions/src/string/rtrim.rs | 11 +++++--
6 files changed, 98 insertions(+), 7 deletions(-)
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index a6847f3327..66f8b3010f 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -113,3 +113,8 @@ required-features = ["datetime_expressions"]
harness = false
name = "substr_index"
required-features = ["unicode_expressions"]
+
+[[bench]]
+harness = false
+name = "ltrim"
+required-features = ["string_expressions"]
diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs
new file mode 100644
index 0000000000..01acb9de33
--- /dev/null
+++ b/datafusion/functions/benches/ltrim.rs
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::array::{ArrayRef, StringArray};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_common::ScalarValue;
+use datafusion_expr::ColumnarValue;
+use datafusion_functions::string;
+use std::sync::Arc;
+
+fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> {
+ let iter =
+ std::iter::repeat(format!("{}datafusion{}", characters, characters)).take(size);
+ let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef;
+ vec![
+ ColumnarValue::Array(array),
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(characters.to_string()))),
+ ]
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+ let ltrim = string::ltrim();
+ for char in ["\"", "Header:"] {
+ for size in [1024, 4096, 8192] {
+ let args = create_args(size, char);
+ c.bench_function(&format!("ltrim {}: {}", char, size), |b| {
+ b.iter(|| black_box(ltrim.invoke(&args)))
+ });
+ }
+ }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs
index b0a85eab6d..971f7bbd4d 100644
--- a/datafusion/functions/src/string/btrim.rs
+++ b/datafusion/functions/src/string/btrim.rs
@@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
+use datafusion_physical_expr::functions::Hint;
use crate::string::common::*;
use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -72,8 +73,14 @@ impl ScalarUDFImpl for BTrimFunc {
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
- DataType::Utf8 => make_scalar_function(btrim::<i32>, vec![])(args),
- DataType::LargeUtf8 => make_scalar_function(btrim::<i64>, vec![])(args),
+ DataType::Utf8 => make_scalar_function(
+ btrim::<i32>,
+ vec![Hint::Pad, Hint::AcceptsSingular],
+ )(args),
+ DataType::LargeUtf8 => make_scalar_function(
+ btrim::<i64>,
+ vec![Hint::Pad, Hint::AcceptsSingular],
+ )(args),
other => exec_err!("Unsupported data type {other:?} for function btrim"),
}
}
diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs
index 276aad121d..2b554db397 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -18,7 +18,9 @@
use std::fmt::{Display, Formatter};
use std::sync::Arc;
-use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::array::{
+ new_null_array, Array, ArrayRef, GenericStringArray, OffsetSizeTrait,
+};
use arrow::datatypes::DataType;
use datafusion_common::cast::as_generic_string_array;
@@ -78,6 +80,19 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
2 => {
let characters_array = as_generic_string_array::<T>(&args[1])?;
+ if characters_array.len() == 1 {
+ if characters_array.is_null(0) {
+ return Ok(new_null_array(args[0].data_type(), args[0].len()));
+ }
+
+ let characters = characters_array.value(0);
+ let result = string_array
+ .iter()
+ .map(|item| item.map(|string| func(string, characters)))
+ .collect::<GenericStringArray<T>>();
+ return Ok(Arc::new(result) as ArrayRef);
+ }
+
let result = string_array
.iter()
.zip(characters_array.iter())
diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs
index ad86259d0d..1a6a9d497f 100644
--- a/datafusion/functions/src/string/ltrim.rs
+++ b/datafusion/functions/src/string/ltrim.rs
@@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
+use datafusion_physical_expr::functions::Hint;
use crate::string::common::*;
use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -70,8 +71,14 @@ impl ScalarUDFImpl for LtrimFunc {
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
- DataType::Utf8 => make_scalar_function(ltrim::<i32>, vec![])(args),
- DataType::LargeUtf8 => make_scalar_function(ltrim::<i64>, vec![])(args),
+ DataType::Utf8 => make_scalar_function(
+ ltrim::<i32>,
+ vec![Hint::Pad, Hint::AcceptsSingular],
+ )(args),
+ DataType::LargeUtf8 => make_scalar_function(
+ ltrim::<i64>,
+ vec![Hint::Pad, Hint::AcceptsSingular],
+ )(args),
other => exec_err!("Unsupported data type {other:?} for function ltrim"),
}
}
diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs
index 607e647b26..e6e93e38c9 100644
--- a/datafusion/functions/src/string/rtrim.rs
+++ b/datafusion/functions/src/string/rtrim.rs
@@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
+use datafusion_physical_expr::functions::Hint;
use crate::string::common::*;
use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -70,8 +71,14 @@ impl ScalarUDFImpl for RtrimFunc {
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
- DataType::Utf8 => make_scalar_function(rtrim::<i32>, vec![])(args),
- DataType::LargeUtf8 => make_scalar_function(rtrim::<i64>, vec![])(args),
+ DataType::Utf8 => make_scalar_function(
+ rtrim::<i32>,
+ vec![Hint::Pad, Hint::AcceptsSingular],
+ )(args),
+ DataType::LargeUtf8 => make_scalar_function(
+ rtrim::<i64>,
+ vec![Hint::Pad, Hint::AcceptsSingular],
+ )(args),
other => exec_err!("Unsupported data type {other:?} for function rtrim"),
}
}