This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new ace9cd44b7 perf: Optimize trim UDFs for single-character trims (#20328)
ace9cd44b7 is described below
commit ace9cd44b7356d60e6d69d0b98ac3f5606d55507
Author: Neil Conway <[email protected]>
AuthorDate: Fri Feb 20 04:28:53 2026 -0500
perf: Optimize trim UDFs for single-character trims (#20328)
## Which issue does this PR close?
- Closes #20327
## Rationale for this change
By default, btrim(), ltrim(), and rtrim() trim space characters; it is
also reasonably common for queries to specify a non-default trim pattern
that is still a single ASCII character.
We can optimize for this case by doing a byte-level scan, rather than
invoking the more heavyweight `str::trim_start_matches` /
`str::trim_end_matches` char-pattern machinery used for more complex
trim scenarios.
## What changes are included in this PR?
Add a benchmark for trimming spaces, and implement the optimization
described above. Also fix an error in the documentation: the default
trim character is a space, not all whitespace.
## Are these changes tested?
Yes, and benchmarked.
## Are there any user-facing changes?
No.
---------
Co-authored-by: Martin Grigorov <[email protected]>
---
datafusion/functions/benches/trim.rs | 132 +++++++++++++++++++++++
datafusion/functions/src/string/btrim.rs | 6 +-
datafusion/functions/src/string/common.rs | 82 +++++++++++---
datafusion/functions/src/string/ltrim.rs | 6 +-
datafusion/functions/src/string/rtrim.rs | 6 +-
datafusion/sqllogictest/test_files/functions.slt | 9 ++
docs/source/user-guide/sql/scalar_functions.md | 12 +--
7 files changed, 224 insertions(+), 29 deletions(-)
diff --git a/datafusion/functions/benches/trim.rs
b/datafusion/functions/benches/trim.rs
index 23a53eefb2..21d99592d1 100644
--- a/datafusion/functions/benches/trim.rs
+++ b/datafusion/functions/benches/trim.rs
@@ -141,6 +141,45 @@ fn create_args(
]
}
+/// Create args for trim benchmark where space characters are being trimmed
+fn create_space_trim_args(
+ size: usize,
+ pad_len: usize,
+ remaining_len: usize,
+ string_array_type: StringArrayType,
+ trim_type: TrimType,
+) -> Vec<ColumnarValue> {
+ let rng = &mut StdRng::seed_from_u64(42);
+ let spaces = " ".repeat(pad_len);
+
+ let string_iter = (0..size).map(|_| {
+ if rng.random::<f32>() < 0.1 {
+ None
+ } else {
+ let content: String = rng
+ .sample_iter(&Alphanumeric)
+ .take(remaining_len)
+ .map(char::from)
+ .collect();
+
+ let value = match trim_type {
+ TrimType::Ltrim => format!("{spaces}{content}"),
+ TrimType::Rtrim => format!("{content}{spaces}"),
+ TrimType::Btrim => format!("{spaces}{content}{spaces}"),
+ };
+ Some(value)
+ }
+ });
+
+ let string_array: ArrayRef = match string_array_type {
+ StringArrayType::Utf8View =>
Arc::new(string_iter.collect::<StringViewArray>()),
+ StringArrayType::Utf8 =>
Arc::new(string_iter.collect::<StringArray>()),
+ StringArrayType::LargeUtf8 =>
Arc::new(string_iter.collect::<LargeStringArray>()),
+ };
+
+ vec![ColumnarValue::Array(string_array)]
+}
+
#[expect(clippy::too_many_arguments)]
fn run_with_string_type<M: Measurement>(
group: &mut BenchmarkGroup<'_, M>,
@@ -221,6 +260,60 @@ fn run_trim_benchmark(
group.finish();
}
+#[expect(clippy::too_many_arguments)]
+fn run_space_trim_benchmark(
+ c: &mut Criterion,
+ group_name: &str,
+ trim_func: &ScalarUDF,
+ trim_type: TrimType,
+ string_types: &[StringArrayType],
+ size: usize,
+ pad_len: usize,
+ remaining_len: usize,
+) {
+ let mut group = c.benchmark_group(group_name);
+ group.sampling_mode(SamplingMode::Flat);
+ group.sample_size(10);
+
+ let total_len = match trim_type {
+ TrimType::Btrim => 2 * pad_len + remaining_len,
+ _ => pad_len + remaining_len,
+ };
+
+ for string_type in string_types {
+ let args =
+ create_space_trim_args(size, pad_len, remaining_len, *string_type,
trim_type);
+ let arg_fields = args
+ .iter()
+ .enumerate()
+ .map(|(idx, arg)| {
+ Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+ })
+ .collect::<Vec<_>>();
+ let config_options = Arc::new(ConfigOptions::default());
+
+ group.bench_function(
+ format!(
+ "{trim_type} {string_type} [size={size}, len={total_len},
pad={pad_len}]",
+ ),
+ |b| {
+ b.iter(|| {
+ let args_cloned = args.clone();
+ black_box(trim_func.invoke_with_args(ScalarFunctionArgs {
+ args: args_cloned,
+ arg_fields: arg_fields.clone(),
+ number_rows: size,
+ return_field: Field::new("f", DataType::Utf8,
true).into(),
+ config_options: Arc::clone(&config_options),
+ }))
+ })
+ },
+ );
+ }
+
+ group.finish();
+}
+
fn criterion_benchmark(c: &mut Criterion) {
let ltrim = string::ltrim();
let rtrim = string::rtrim();
@@ -295,6 +388,45 @@ fn criterion_benchmark(c: &mut Criterion) {
&trimmed,
remaining_len,
);
+
+ // Scenario 4: Trim spaces, short strings (len <= 12)
+ // pad_len=4, remaining_len=8
+ run_space_trim_benchmark(
+ c,
+ "trim spaces, short strings (len <= 12)",
+ trim_func,
+ *trim_type,
+ &string_types,
+ size,
+ 4,
+ 8,
+ );
+
+ // Scenario 5: Trim spaces, long strings (len > 12)
+ // pad_len=4, remaining_len=60
+ run_space_trim_benchmark(
+ c,
+ "trim spaces, long strings",
+ trim_func,
+ *trim_type,
+ &string_types,
+ size,
+ 4,
+ 60,
+ );
+
+ // Scenario 6: Trim spaces, long strings, heavy padding
+ // pad_len=56, remaining_len=8
+ run_space_trim_benchmark(
+ c,
+ "trim spaces, heavy padding",
+ trim_func,
+ *trim_type,
+ &string_types,
+ size,
+ 56,
+ 8,
+ );
}
}
}
diff --git a/datafusion/functions/src/string/btrim.rs
b/datafusion/functions/src/string/btrim.rs
index 3ca5db3c49..beea527f6d 100644
--- a/datafusion/functions/src/string/btrim.rs
+++ b/datafusion/functions/src/string/btrim.rs
@@ -30,7 +30,7 @@ use datafusion_macros::user_doc;
use std::any::Any;
use std::sync::Arc;
-/// Returns the longest string with leading and trailing characters removed.
If the characters are not specified, whitespace is removed.
+/// Returns the longest string with leading and trailing characters removed.
If the characters are not specified, spaces are removed.
/// btrim('xyxtrimyyx', 'xyz') = 'trim'
fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -45,7 +45,7 @@ fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
#[user_doc(
doc_section(label = "String Functions"),
- description = "Trims the specified trim string from the start and end of a
string. If no trim string is provided, all whitespace is removed from the start
and end of the input string.",
+ description = "Trims the specified trim string from the start and end of a
string. If no trim string is provided, all spaces are removed from the start
and end of the input string.",
syntax_example = "btrim(str[, trim_str])",
sql_example = r#"```sql
> select btrim('__datafusion____', '_');
@@ -58,7 +58,7 @@ fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
standard_argument(name = "str", prefix = "String"),
argument(
name = "trim_str",
- description = r"String expression to operate on. Can be a constant,
column, or function, and any combination of operators. _Default is whitespace
characters._"
+ description = r"String expression to operate on. Can be a constant,
column, or function, and any combination of operators. _Default is a space._"
),
alternative_syntax = "trim(BOTH trim_str FROM str)",
alternative_syntax = "trim(trim_str FROM str)",
diff --git a/datafusion/functions/src/string/common.rs
b/datafusion/functions/src/string/common.rs
index 4a775c2744..77af82e25c 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -38,6 +38,22 @@ use datafusion_expr::ColumnarValue;
/// from the beginning of the input string where the trimmed result starts.
pub(crate) trait Trimmer {
fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32);
+
+ /// Optimized trim for a single ASCII byte.
+ /// Uses byte-level scanning instead of char-level iteration.
+ fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32);
+}
+
+/// Returns the number of leading bytes matching `byte`
+#[inline]
+fn leading_bytes(bytes: &[u8], byte: u8) -> usize {
+ bytes.iter().take_while(|&&b| b == byte).count()
+}
+
+/// Returns the number of trailing bytes matching `byte`
+#[inline]
+fn trailing_bytes(bytes: &[u8], byte: u8) -> usize {
+ bytes.iter().rev().take_while(|&&b| b == byte).count()
}
/// Left trim - removes leading characters
@@ -46,10 +62,19 @@ pub(crate) struct TrimLeft;
impl Trimmer for TrimLeft {
#[inline]
fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
+ if pattern.len() == 1 && pattern[0].is_ascii() {
+ return Self::trim_ascii_char(input, pattern[0] as u8);
+ }
let trimmed = input.trim_start_matches(pattern);
let offset = (input.len() - trimmed.len()) as u32;
(trimmed, offset)
}
+
+ #[inline]
+ fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
+ let start = leading_bytes(input.as_bytes(), byte);
+ (&input[start..], start as u32)
+ }
}
/// Right trim - removes trailing characters
@@ -58,9 +83,19 @@ pub(crate) struct TrimRight;
impl Trimmer for TrimRight {
#[inline]
fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
+ if pattern.len() == 1 && pattern[0].is_ascii() {
+ return Self::trim_ascii_char(input, pattern[0] as u8);
+ }
let trimmed = input.trim_end_matches(pattern);
(trimmed, 0)
}
+
+ #[inline]
+ fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
+ let bytes = input.as_bytes();
+ let end = bytes.len() - trailing_bytes(bytes, byte);
+ (&input[..end], 0)
+ }
}
/// Both trim - removes both leading and trailing characters
@@ -69,11 +104,22 @@ pub(crate) struct TrimBoth;
impl Trimmer for TrimBoth {
#[inline]
fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
+ if pattern.len() == 1 && pattern[0].is_ascii() {
+ return Self::trim_ascii_char(input, pattern[0] as u8);
+ }
let left_trimmed = input.trim_start_matches(pattern);
let offset = (input.len() - left_trimmed.len()) as u32;
let trimmed = left_trimmed.trim_end_matches(pattern);
(trimmed, offset)
}
+
+ #[inline]
+ fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
+ let bytes = input.as_bytes();
+ let start = leading_bytes(bytes, byte);
+ let end = bytes.len() - trailing_bytes(&bytes[start..], byte);
+ (&input[start..end], start as u32)
+ }
}
pub(crate) fn general_trim<T: OffsetSizeTrait, Tr: Trimmer>(
@@ -99,19 +145,24 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) ->
Result<ArrayRef> {
match args.len() {
1 => {
- // Default whitespace trim - pattern is just space
- let pattern = [' '];
+ // Trim spaces by default
for (src_str_opt, raw_view) in string_view_array
.iter()
.zip(string_view_array.views().iter())
{
- trim_and_append_view::<Tr>(
- src_str_opt,
- &pattern,
- &mut views_buf,
- &mut null_builder,
- raw_view,
- );
+ if let Some(src_str) = src_str_opt {
+ let (trimmed, offset) = Tr::trim_ascii_char(src_str, b' ');
+ make_and_append_view(
+ &mut views_buf,
+ &mut null_builder,
+ raw_view,
+ trimmed,
+ offset,
+ );
+ } else {
+ null_builder.append_null();
+ views_buf.push(0);
+ }
}
}
2 => {
@@ -141,6 +192,7 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) ->
Result<ArrayRef> {
}
} else {
// Per-row pattern - must compute pattern chars for each row
+ let mut pattern: Vec<char> = Vec::new();
for ((src_str_opt, raw_view), characters_opt) in
string_view_array
.iter()
.zip(string_view_array.views().iter())
@@ -149,7 +201,8 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) ->
Result<ArrayRef> {
if let (Some(src_str), Some(characters)) =
(src_str_opt, characters_opt)
{
- let pattern: Vec<char> = characters.chars().collect();
+ pattern.clear();
+ pattern.extend(characters.chars());
let (trimmed, offset) = Tr::trim(src_str, &pattern);
make_and_append_view(
&mut views_buf,
@@ -225,11 +278,10 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args:
&[ArrayRef]) -> Result<Arr
match args.len() {
1 => {
- // Default whitespace trim - pattern is just space
- let pattern = [' '];
+ // Trim spaces by default
let result = string_array
.iter()
- .map(|string| string.map(|s| Tr::trim(s, &pattern).0))
+ .map(|string| string.map(|s| Tr::trim_ascii_char(s, b' ').0))
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
@@ -255,12 +307,14 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args:
&[ArrayRef]) -> Result<Arr
}
// Per-row pattern - must compute pattern chars for each row
+ let mut pattern: Vec<char> = Vec::new();
let result = string_array
.iter()
.zip(characters_array.iter())
.map(|(string, characters)| match (string, characters) {
(Some(s), Some(c)) => {
- let pattern: Vec<char> = c.chars().collect();
+ pattern.clear();
+ pattern.extend(c.chars());
Some(Tr::trim(s, &pattern).0)
}
_ => None,
diff --git a/datafusion/functions/src/string/ltrim.rs
b/datafusion/functions/src/string/ltrim.rs
index abdf83e2d7..f84b273b8d 100644
--- a/datafusion/functions/src/string/ltrim.rs
+++ b/datafusion/functions/src/string/ltrim.rs
@@ -31,7 +31,7 @@ use datafusion_expr::{
};
use datafusion_macros::user_doc;
-/// Returns the longest string with leading characters removed. If the
characters are not specified, whitespace is removed.
+/// Returns the longest string with leading characters removed. If the
characters are not specified, spaces are removed.
/// ltrim('zzzytest', 'xyz') = 'test'
fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -46,7 +46,7 @@ fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
#[user_doc(
doc_section(label = "String Functions"),
- description = "Trims the specified trim string from the beginning of a
string. If no trim string is provided, all whitespace is removed from the start
of the input string.",
+ description = "Trims the specified trim string from the beginning of a
string. If no trim string is provided, spaces are removed from the start of the
input string.",
syntax_example = "ltrim(str[, trim_str])",
sql_example = r#"```sql
> select ltrim(' datafusion ');
@@ -65,7 +65,7 @@ fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
standard_argument(name = "str", prefix = "String"),
argument(
name = "trim_str",
- description = r"String expression to trim from the beginning of the
input string. Can be a constant, column, or function, and any combination of
arithmetic operators. _Default is whitespace characters._"
+ description = r"String expression to trim from the beginning of the
input string. Can be a constant, column, or function, and any combination of
arithmetic operators. _Default is a space._"
),
alternative_syntax = "trim(LEADING trim_str FROM str)",
related_udf(name = "btrim"),
diff --git a/datafusion/functions/src/string/rtrim.rs
b/datafusion/functions/src/string/rtrim.rs
index 0916c51479..5659d0acfd 100644
--- a/datafusion/functions/src/string/rtrim.rs
+++ b/datafusion/functions/src/string/rtrim.rs
@@ -31,7 +31,7 @@ use datafusion_expr::{
};
use datafusion_macros::user_doc;
-/// Returns the longest string with trailing characters removed. If the
characters are not specified, whitespace is removed.
+/// Returns the longest string with trailing characters removed. If the
characters are not specified, spaces are removed.
/// rtrim('testxxzx', 'xyz') = 'test'
fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -46,7 +46,7 @@ fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
#[user_doc(
doc_section(label = "String Functions"),
- description = "Trims the specified trim string from the end of a string.
If no trim string is provided, all whitespace is removed from the end of the
input string.",
+ description = "Trims the specified trim string from the end of a string.
If no trim string is provided, all spaces are removed from the end of the input
string.",
syntax_example = "rtrim(str[, trim_str])",
alternative_syntax = "trim(TRAILING trim_str FROM str)",
sql_example = r#"```sql
@@ -66,7 +66,7 @@ fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
standard_argument(name = "str", prefix = "String"),
argument(
name = "trim_str",
- description = "String expression to trim from the end of the input
string. Can be a constant, column, or function, and any combination of
arithmetic operators. _Default is whitespace characters._"
+ description = "String expression to trim from the end of the input
string. Can be a constant, column, or function, and any combination of
arithmetic operators. _Default is a space._"
),
related_udf(name = "btrim"),
related_udf(name = "ltrim")
diff --git a/datafusion/sqllogictest/test_files/functions.slt
b/datafusion/sqllogictest/test_files/functions.slt
index 35a32897d0..5a43d18e23 100644
--- a/datafusion/sqllogictest/test_files/functions.slt
+++ b/datafusion/sqllogictest/test_files/functions.slt
@@ -541,6 +541,15 @@ SELECT trim(arrow_cast(' foo ', 'Dictionary(Int32,
Utf8)'))
----
foo
+# Verify that trim, ltrim, and rtrim only strip spaces by default,
+# not other whitespace characters (tabs, newlines, etc.)
+query III
+SELECT length(trim(chr(9) || 'foo' || chr(10))),
+ length(ltrim(chr(9) || 'foo')),
+ length(rtrim('foo' || chr(10)))
+----
+5 4 4
+
query I
SELECT bit_length('foo')
----
diff --git a/docs/source/user-guide/sql/scalar_functions.md
b/docs/source/user-guide/sql/scalar_functions.md
index 9e0a56cd03..78d13066d9 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -1225,7 +1225,7 @@ bit_length(str)
### `btrim`
-Trims the specified trim string from the start and end of a string. If no trim
string is provided, all whitespace is removed from the start and end of the
input string.
+Trims the specified trim string from the start and end of a string. If no trim
string is provided, all spaces are removed from the start and end of the input
string.
```sql
btrim(str[, trim_str])
@@ -1234,7 +1234,7 @@ btrim(str[, trim_str])
#### Arguments
- **str**: String expression to operate on. Can be a constant, column, or
function, and any combination of operators.
-- **trim_str**: String expression to operate on. Can be a constant, column, or
function, and any combination of operators. _Default is whitespace characters._
+- **trim_str**: String expression to operate on. Can be a constant, column, or
function, and any combination of operators. _Default is a space._
#### Example
@@ -1612,7 +1612,7 @@ lpad(str, n[, padding_str])
### `ltrim`
-Trims the specified trim string from the beginning of a string. If no trim
string is provided, all whitespace is removed from the start of the input
string.
+Trims the specified trim string from the beginning of a string. If no trim
string is provided, spaces are removed from the start of the input string.
```sql
ltrim(str[, trim_str])
@@ -1621,7 +1621,7 @@ ltrim(str[, trim_str])
#### Arguments
- **str**: String expression to operate on. Can be a constant, column, or
function, and any combination of operators.
-- **trim_str**: String expression to trim from the beginning of the input
string. Can be a constant, column, or function, and any combination of
arithmetic operators. _Default is whitespace characters._
+- **trim_str**: String expression to trim from the beginning of the input
string. Can be a constant, column, or function, and any combination of
arithmetic operators. _Default is a space._
#### Example
@@ -1840,7 +1840,7 @@ rpad(str, n[, padding_str])
### `rtrim`
-Trims the specified trim string from the end of a string. If no trim string is
provided, all whitespace is removed from the end of the input string.
+Trims the specified trim string from the end of a string. If no trim string is
provided, all spaces are removed from the end of the input string.
```sql
rtrim(str[, trim_str])
@@ -1849,7 +1849,7 @@ rtrim(str[, trim_str])
#### Arguments
- **str**: String expression to operate on. Can be a constant, column, or
function, and any combination of operators.
-- **trim_str**: String expression to trim from the end of the input string.
Can be a constant, column, or function, and any combination of arithmetic
operators. _Default is whitespace characters._
+- **trim_str**: String expression to trim from the end of the input string.
Can be a constant, column, or function, and any combination of arithmetic
operators. _Default is a space._
#### Example
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]