This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new ace9cd44b7 perf: Optimize trim UDFs for single-character trims (#20328)
ace9cd44b7 is described below

commit ace9cd44b7356d60e6d69d0b98ac3f5606d55507
Author: Neil Conway <[email protected]>
AuthorDate: Fri Feb 20 04:28:53 2026 -0500

    perf: Optimize trim UDFs for single-character trims (#20328)
    
    ## Which issue does this PR close?
    
    - Closes #20327
    
    ## Rationale for this change
    
    By default, btrim(), ltrim(), and rtrim() trim space characters; it is
    also reasonably common for queries to specify a non-default trim pattern
    that is still a single ASCII character.
    
    We can optimize for this case by doing a byte-level scan, rather than
    invoking the more heavyweight std::string machinery used for more
    complex trim scenarios.
    
    ## What changes are included in this PR?
    
    Add a benchmark for trimming spaces, and implement the optimization
    described above. Also fixed an error in the documentation.
    
    ## Are these changes tested?
    
    Yes, and benchmarked.
    
    ## Are there any user-facing changes?
    
    No.
    
    ---------
    
    Co-authored-by: Martin Grigorov <[email protected]>
---
 datafusion/functions/benches/trim.rs             | 132 +++++++++++++++++++++++
 datafusion/functions/src/string/btrim.rs         |   6 +-
 datafusion/functions/src/string/common.rs        |  82 +++++++++++---
 datafusion/functions/src/string/ltrim.rs         |   6 +-
 datafusion/functions/src/string/rtrim.rs         |   6 +-
 datafusion/sqllogictest/test_files/functions.slt |   9 ++
 docs/source/user-guide/sql/scalar_functions.md   |  12 +--
 7 files changed, 224 insertions(+), 29 deletions(-)

diff --git a/datafusion/functions/benches/trim.rs b/datafusion/functions/benches/trim.rs
index 23a53eefb2..21d99592d1 100644
--- a/datafusion/functions/benches/trim.rs
+++ b/datafusion/functions/benches/trim.rs
@@ -141,6 +141,45 @@ fn create_args(
     ]
 }
 
+/// Create args for trim benchmark where space characters are being trimmed
+fn create_space_trim_args(
+    size: usize,
+    pad_len: usize,
+    remaining_len: usize,
+    string_array_type: StringArrayType,
+    trim_type: TrimType,
+) -> Vec<ColumnarValue> {
+    let rng = &mut StdRng::seed_from_u64(42);
+    let spaces = " ".repeat(pad_len);
+
+    let string_iter = (0..size).map(|_| {
+        if rng.random::<f32>() < 0.1 {
+            None
+        } else {
+            let content: String = rng
+                .sample_iter(&Alphanumeric)
+                .take(remaining_len)
+                .map(char::from)
+                .collect();
+
+            let value = match trim_type {
+                TrimType::Ltrim => format!("{spaces}{content}"),
+                TrimType::Rtrim => format!("{content}{spaces}"),
+                TrimType::Btrim => format!("{spaces}{content}{spaces}"),
+            };
+            Some(value)
+        }
+    });
+
+    let string_array: ArrayRef = match string_array_type {
+        StringArrayType::Utf8View => 
Arc::new(string_iter.collect::<StringViewArray>()),
+        StringArrayType::Utf8 => 
Arc::new(string_iter.collect::<StringArray>()),
+        StringArrayType::LargeUtf8 => 
Arc::new(string_iter.collect::<LargeStringArray>()),
+    };
+
+    vec![ColumnarValue::Array(string_array)]
+}
+
 #[expect(clippy::too_many_arguments)]
 fn run_with_string_type<M: Measurement>(
     group: &mut BenchmarkGroup<'_, M>,
@@ -221,6 +260,60 @@ fn run_trim_benchmark(
     group.finish();
 }
 
+#[expect(clippy::too_many_arguments)]
+fn run_space_trim_benchmark(
+    c: &mut Criterion,
+    group_name: &str,
+    trim_func: &ScalarUDF,
+    trim_type: TrimType,
+    string_types: &[StringArrayType],
+    size: usize,
+    pad_len: usize,
+    remaining_len: usize,
+) {
+    let mut group = c.benchmark_group(group_name);
+    group.sampling_mode(SamplingMode::Flat);
+    group.sample_size(10);
+
+    let total_len = match trim_type {
+        TrimType::Btrim => 2 * pad_len + remaining_len,
+        _ => pad_len + remaining_len,
+    };
+
+    for string_type in string_types {
+        let args =
+            create_space_trim_args(size, pad_len, remaining_len, *string_type, 
trim_type);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+        let config_options = Arc::new(ConfigOptions::default());
+
+        group.bench_function(
+            format!(
+                "{trim_type} {string_type} [size={size}, len={total_len}, 
pad={pad_len}]",
+            ),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(trim_func.invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8, 
true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
 fn criterion_benchmark(c: &mut Criterion) {
     let ltrim = string::ltrim();
     let rtrim = string::rtrim();
@@ -295,6 +388,45 @@ fn criterion_benchmark(c: &mut Criterion) {
                 &trimmed,
                 remaining_len,
             );
+
+            // Scenario 4: Trim spaces, short strings (len <= 12)
+            // pad_len=4, remaining_len=8
+            run_space_trim_benchmark(
+                c,
+                "trim spaces, short strings (len <= 12)",
+                trim_func,
+                *trim_type,
+                &string_types,
+                size,
+                4,
+                8,
+            );
+
+            // Scenario 5: Trim spaces, long strings (len > 12)
+            // pad_len=4, remaining_len=60
+            run_space_trim_benchmark(
+                c,
+                "trim spaces, long strings",
+                trim_func,
+                *trim_type,
+                &string_types,
+                size,
+                4,
+                60,
+            );
+
+            // Scenario 6: Trim spaces, long strings, heavy padding
+            // pad_len=56, remaining_len=8
+            run_space_trim_benchmark(
+                c,
+                "trim spaces, heavy padding",
+                trim_func,
+                *trim_type,
+                &string_types,
+                size,
+                56,
+                8,
+            );
         }
     }
 }
diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs
index 3ca5db3c49..beea527f6d 100644
--- a/datafusion/functions/src/string/btrim.rs
+++ b/datafusion/functions/src/string/btrim.rs
@@ -30,7 +30,7 @@ use datafusion_macros::user_doc;
 use std::any::Any;
 use std::sync::Arc;
 
-/// Returns the longest string with leading and trailing characters removed. 
If the characters are not specified, whitespace is removed.
+/// Returns the longest string with leading and trailing characters removed. 
If the characters are not specified, spaces are removed.
 /// btrim('xyxtrimyyx', 'xyz') = 'trim'
 fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -45,7 +45,7 @@ fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Trims the specified trim string from the start and end of a 
string. If no trim string is provided, all whitespace is removed from the start 
and end of the input string.",
+    description = "Trims the specified trim string from the start and end of a 
string. If no trim string is provided, all spaces are removed from the start 
and end of the input string.",
     syntax_example = "btrim(str[, trim_str])",
     sql_example = r#"```sql
 > select btrim('__datafusion____', '_');
@@ -58,7 +58,7 @@ fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     standard_argument(name = "str", prefix = "String"),
     argument(
         name = "trim_str",
-        description = r"String expression to operate on. Can be a constant, 
column, or function, and any combination of operators. _Default is whitespace 
characters._"
+        description = r"String expression to operate on. Can be a constant, 
column, or function, and any combination of operators. _Default is a space._"
     ),
     alternative_syntax = "trim(BOTH trim_str FROM str)",
     alternative_syntax = "trim(trim_str FROM str)",
diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs
index 4a775c2744..77af82e25c 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -38,6 +38,22 @@ use datafusion_expr::ColumnarValue;
 /// from the beginning of the input string where the trimmed result starts.
 pub(crate) trait Trimmer {
     fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32);
+
+    /// Optimized trim for a single ASCII byte.
+    /// Uses byte-level scanning instead of char-level iteration.
+    fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32);
+}
+
+/// Returns the number of leading bytes matching `byte`
+#[inline]
+fn leading_bytes(bytes: &[u8], byte: u8) -> usize {
+    bytes.iter().take_while(|&&b| b == byte).count()
+}
+
+/// Returns the number of trailing bytes matching `byte`
+#[inline]
+fn trailing_bytes(bytes: &[u8], byte: u8) -> usize {
+    bytes.iter().rev().take_while(|&&b| b == byte).count()
 }
 
 /// Left trim - removes leading characters
@@ -46,10 +62,19 @@ pub(crate) struct TrimLeft;
 impl Trimmer for TrimLeft {
     #[inline]
     fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
+        if pattern.len() == 1 && pattern[0].is_ascii() {
+            return Self::trim_ascii_char(input, pattern[0] as u8);
+        }
         let trimmed = input.trim_start_matches(pattern);
         let offset = (input.len() - trimmed.len()) as u32;
         (trimmed, offset)
     }
+
+    #[inline]
+    fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
+        let start = leading_bytes(input.as_bytes(), byte);
+        (&input[start..], start as u32)
+    }
 }
 
 /// Right trim - removes trailing characters
@@ -58,9 +83,19 @@ pub(crate) struct TrimRight;
 impl Trimmer for TrimRight {
     #[inline]
     fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
+        if pattern.len() == 1 && pattern[0].is_ascii() {
+            return Self::trim_ascii_char(input, pattern[0] as u8);
+        }
         let trimmed = input.trim_end_matches(pattern);
         (trimmed, 0)
     }
+
+    #[inline]
+    fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
+        let bytes = input.as_bytes();
+        let end = bytes.len() - trailing_bytes(bytes, byte);
+        (&input[..end], 0)
+    }
 }
 
 /// Both trim - removes both leading and trailing characters
@@ -69,11 +104,22 @@ pub(crate) struct TrimBoth;
 impl Trimmer for TrimBoth {
     #[inline]
     fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
+        if pattern.len() == 1 && pattern[0].is_ascii() {
+            return Self::trim_ascii_char(input, pattern[0] as u8);
+        }
         let left_trimmed = input.trim_start_matches(pattern);
         let offset = (input.len() - left_trimmed.len()) as u32;
         let trimmed = left_trimmed.trim_end_matches(pattern);
         (trimmed, offset)
     }
+
+    #[inline]
+    fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
+        let bytes = input.as_bytes();
+        let start = leading_bytes(bytes, byte);
+        let end = bytes.len() - trailing_bytes(&bytes[start..], byte);
+        (&input[start..end], start as u32)
+    }
 }
 
 pub(crate) fn general_trim<T: OffsetSizeTrait, Tr: Trimmer>(
@@ -99,19 +145,24 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
 
     match args.len() {
         1 => {
-            // Default whitespace trim - pattern is just space
-            let pattern = [' '];
+            // Trim spaces by default
             for (src_str_opt, raw_view) in string_view_array
                 .iter()
                 .zip(string_view_array.views().iter())
             {
-                trim_and_append_view::<Tr>(
-                    src_str_opt,
-                    &pattern,
-                    &mut views_buf,
-                    &mut null_builder,
-                    raw_view,
-                );
+                if let Some(src_str) = src_str_opt {
+                    let (trimmed, offset) = Tr::trim_ascii_char(src_str, b' ');
+                    make_and_append_view(
+                        &mut views_buf,
+                        &mut null_builder,
+                        raw_view,
+                        trimmed,
+                        offset,
+                    );
+                } else {
+                    null_builder.append_null();
+                    views_buf.push(0);
+                }
             }
         }
         2 => {
@@ -141,6 +192,7 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
                 }
             } else {
                 // Per-row pattern - must compute pattern chars for each row
+                let mut pattern: Vec<char> = Vec::new();
                 for ((src_str_opt, raw_view), characters_opt) in 
string_view_array
                     .iter()
                     .zip(string_view_array.views().iter())
@@ -149,7 +201,8 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
                     if let (Some(src_str), Some(characters)) =
                         (src_str_opt, characters_opt)
                     {
-                        let pattern: Vec<char> = characters.chars().collect();
+                        pattern.clear();
+                        pattern.extend(characters.chars());
                         let (trimmed, offset) = Tr::trim(src_str, &pattern);
                         make_and_append_view(
                             &mut views_buf,
@@ -225,11 +278,10 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args: &[ArrayRef]) -> Result<Arr
 
     match args.len() {
         1 => {
-            // Default whitespace trim - pattern is just space
-            let pattern = [' '];
+            // Trim spaces by default
             let result = string_array
                 .iter()
-                .map(|string| string.map(|s| Tr::trim(s, &pattern).0))
+                .map(|string| string.map(|s| Tr::trim_ascii_char(s, b' ').0))
                 .collect::<GenericStringArray<T>>();
 
             Ok(Arc::new(result) as ArrayRef)
@@ -255,12 +307,14 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args: &[ArrayRef]) -> Result<Arr
             }
 
             // Per-row pattern - must compute pattern chars for each row
+            let mut pattern: Vec<char> = Vec::new();
             let result = string_array
                 .iter()
                 .zip(characters_array.iter())
                 .map(|(string, characters)| match (string, characters) {
                     (Some(s), Some(c)) => {
-                        let pattern: Vec<char> = c.chars().collect();
+                        pattern.clear();
+                        pattern.extend(c.chars());
                         Some(Tr::trim(s, &pattern).0)
                     }
                     _ => None,
diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs
index abdf83e2d7..f84b273b8d 100644
--- a/datafusion/functions/src/string/ltrim.rs
+++ b/datafusion/functions/src/string/ltrim.rs
@@ -31,7 +31,7 @@ use datafusion_expr::{
 };
 use datafusion_macros::user_doc;
 
-/// Returns the longest string  with leading characters removed. If the 
characters are not specified, whitespace is removed.
+/// Returns the longest string with leading characters removed. If the 
characters are not specified, spaces are removed.
 /// ltrim('zzzytest', 'xyz') = 'test'
 fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -46,7 +46,7 @@ fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Trims the specified trim string from the beginning of a 
string. If no trim string is provided, all whitespace is removed from the start 
of the input string.",
+    description = "Trims the specified trim string from the beginning of a 
string. If no trim string is provided, spaces are removed from the start of the 
input string.",
     syntax_example = "ltrim(str[, trim_str])",
     sql_example = r#"```sql
 > select ltrim('  datafusion  ');
@@ -65,7 +65,7 @@ fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     standard_argument(name = "str", prefix = "String"),
     argument(
         name = "trim_str",
-        description = r"String expression to trim from the beginning of the 
input string. Can be a constant, column, or function, and any combination of 
arithmetic operators. _Default is whitespace characters._"
+        description = r"String expression to trim from the beginning of the 
input string. Can be a constant, column, or function, and any combination of 
arithmetic operators. _Default is a space._"
     ),
     alternative_syntax = "trim(LEADING trim_str FROM str)",
     related_udf(name = "btrim"),
diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs
index 0916c51479..5659d0acfd 100644
--- a/datafusion/functions/src/string/rtrim.rs
+++ b/datafusion/functions/src/string/rtrim.rs
@@ -31,7 +31,7 @@ use datafusion_expr::{
 };
 use datafusion_macros::user_doc;
 
-/// Returns the longest string  with trailing characters removed. If the 
characters are not specified, whitespace is removed.
+/// Returns the longest string with trailing characters removed. If the 
characters are not specified, spaces are removed.
 /// rtrim('testxxzx', 'xyz') = 'test'
 fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -46,7 +46,7 @@ fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Trims the specified trim string from the end of a string. 
If no trim string is provided, all whitespace is removed from the end of the 
input string.",
+    description = "Trims the specified trim string from the end of a string. 
If no trim string is provided, all spaces are removed from the end of the input 
string.",
     syntax_example = "rtrim(str[, trim_str])",
     alternative_syntax = "trim(TRAILING trim_str FROM str)",
     sql_example = r#"```sql
@@ -66,7 +66,7 @@ fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     standard_argument(name = "str", prefix = "String"),
     argument(
         name = "trim_str",
-        description = "String expression to trim from the end of the input 
string. Can be a constant, column, or function, and any combination of 
arithmetic operators. _Default is whitespace characters._"
+        description = "String expression to trim from the end of the input 
string. Can be a constant, column, or function, and any combination of 
arithmetic operators. _Default is a space._"
     ),
     related_udf(name = "btrim"),
     related_udf(name = "ltrim")
diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt
index 35a32897d0..5a43d18e23 100644
--- a/datafusion/sqllogictest/test_files/functions.slt
+++ b/datafusion/sqllogictest/test_files/functions.slt
@@ -541,6 +541,15 @@ SELECT trim(arrow_cast('  foo  ', 'Dictionary(Int32, Utf8)'))
 ----
 foo
 
+# Verify that trim, ltrim, and rtrim only strip spaces by default,
+# not other whitespace characters (tabs, newlines, etc.)
+query III
+SELECT length(trim(chr(9) || 'foo' || chr(10))),
+       length(ltrim(chr(9) || 'foo')),
+       length(rtrim('foo' || chr(10)))
+----
+5 4 4
+
 query I
 SELECT bit_length('foo')
 ----
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index 9e0a56cd03..78d13066d9 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -1225,7 +1225,7 @@ bit_length(str)
 
 ### `btrim`
 
-Trims the specified trim string from the start and end of a string. If no trim 
string is provided, all whitespace is removed from the start and end of the 
input string.
+Trims the specified trim string from the start and end of a string. If no trim 
string is provided, all spaces are removed from the start and end of the input 
string.
 
 ```sql
 btrim(str[, trim_str])
@@ -1234,7 +1234,7 @@ btrim(str[, trim_str])
 #### Arguments
 
 - **str**: String expression to operate on. Can be a constant, column, or 
function, and any combination of operators.
-- **trim_str**: String expression to operate on. Can be a constant, column, or 
function, and any combination of operators. _Default is whitespace characters._
+- **trim_str**: String expression to operate on. Can be a constant, column, or 
function, and any combination of operators. _Default is a space._
 
 #### Example
 
@@ -1612,7 +1612,7 @@ lpad(str, n[, padding_str])
 
 ### `ltrim`
 
-Trims the specified trim string from the beginning of a string. If no trim 
string is provided, all whitespace is removed from the start of the input 
string.
+Trims the specified trim string from the beginning of a string. If no trim 
string is provided, spaces are removed from the start of the input string.
 
 ```sql
 ltrim(str[, trim_str])
@@ -1621,7 +1621,7 @@ ltrim(str[, trim_str])
 #### Arguments
 
 - **str**: String expression to operate on. Can be a constant, column, or 
function, and any combination of operators.
-- **trim_str**: String expression to trim from the beginning of the input 
string. Can be a constant, column, or function, and any combination of 
arithmetic operators. _Default is whitespace characters._
+- **trim_str**: String expression to trim from the beginning of the input 
string. Can be a constant, column, or function, and any combination of 
arithmetic operators. _Default is a space._
 
 #### Example
 
@@ -1840,7 +1840,7 @@ rpad(str, n[, padding_str])
 
 ### `rtrim`
 
-Trims the specified trim string from the end of a string. If no trim string is 
provided, all whitespace is removed from the end of the input string.
+Trims the specified trim string from the end of a string. If no trim string is 
provided, all spaces are removed from the end of the input string.
 
 ```sql
 rtrim(str[, trim_str])
@@ -1849,7 +1849,7 @@ rtrim(str[, trim_str])
 #### Arguments
 
 - **str**: String expression to operate on. Can be a constant, column, or 
function, and any combination of operators.
-- **trim_str**: String expression to trim from the end of the input string. 
Can be a constant, column, or function, and any combination of arithmetic 
operators. _Default is whitespace characters._
+- **trim_str**: String expression to trim from the end of the input string. 
Can be a constant, column, or function, and any combination of arithmetic 
operators. _Default is a space._
 
 #### Example
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to