Re: [PR] perf: implement native Rust trim functions for better performance [datafusion-comet]

via GitHub Fri, 26 Dec 2025 09:52:00 -0800


Copilot commented on code in PR #2988:
URL: https://github.com/apache/datafusion-comet/pull/2988#discussion_r2648502551



##########
native/spark-expr/src/string_funcs/trim.rs:
##########
@@ -0,0 +1,319 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+use datafusion::common::{Result as DataFusionResult, ScalarValue};
+use datafusion::logical_expr::ColumnarValue;
+use std::sync::Arc;
+
+/// Trims whitespace from both ends of a string (Spark's TRIM function)
+pub fn spark_trim(args: &[ColumnarValue]) -> DataFusionResult<ColumnarValue> {
+    if args.len() != 1 {
+        return Err(datafusion::common::DataFusionError::Execution(
+            format!("trim expects 1 argument, got {}", args.len()),
+        ));
+    }
+
+    match &args[0] {
+        ColumnarValue::Array(array) => {
+            let result = trim_array(array, TrimType::Both)?;
+            Ok(ColumnarValue::Array(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                s.trim().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(
+                s.trim().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+        }
+        _ => Err(datafusion::common::DataFusionError::Execution(
+            "trim expects string argument".to_string(),
+        )),
+    }
+}

Review Comment:
   The implementation only handles the single-argument form of TRIM, but 
Spark's TRIM function supports an optional second parameter to specify custom 
characters to trim (e.g., `TRIM('SL' FROM col)` or `TRIM(BOTH 'SL' FROM col)`). 
   
   According to the test in `CometStringExpressionSuite.scala` line 249, 
queries like `SELECT trim('SL', col) FROM table` are expected to work. The 
current implementation will reject these with an argument count error.
   
   The implementation should support both:
   1. `trim(string)` - trim whitespace (current implementation)
   2. `trim(trimChars, string)` - trim specific characters
   
   This applies to all trim variants (trim, btrim, ltrim, rtrim). Consider 
either:
   - Extending this implementation to support both argument counts
   - Documenting that this is a partial implementation and updating 
tests/documentation accordingly



##########
native/spark-expr/src/string_funcs/trim.rs:
##########
@@ -0,0 +1,319 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+use datafusion::common::{Result as DataFusionResult, ScalarValue};
+use datafusion::logical_expr::ColumnarValue;
+use std::sync::Arc;
+
+/// Trims whitespace from both ends of a string (Spark's TRIM function)
+pub fn spark_trim(args: &[ColumnarValue]) -> DataFusionResult<ColumnarValue> {
+    if args.len() != 1 {
+        return Err(datafusion::common::DataFusionError::Execution(
+            format!("trim expects 1 argument, got {}", args.len()),
+        ));
+    }
+
+    match &args[0] {
+        ColumnarValue::Array(array) => {
+            let result = trim_array(array, TrimType::Both)?;
+            Ok(ColumnarValue::Array(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                s.trim().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(
+                s.trim().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+        }
+        _ => Err(datafusion::common::DataFusionError::Execution(
+            "trim expects string argument".to_string(),
+        )),
+    }
+}
+
+/// Trims whitespace from the left side of a string (Spark's LTRIM function)
+pub fn spark_ltrim(args: &[ColumnarValue]) -> DataFusionResult<ColumnarValue> {
+    if args.len() != 1 {
+        return Err(datafusion::common::DataFusionError::Execution(
+            format!("ltrim expects 1 argument, got {}", args.len()),
+        ));
+    }
+
+    match &args[0] {
+        ColumnarValue::Array(array) => {
+            let result = trim_array(array, TrimType::Left)?;
+            Ok(ColumnarValue::Array(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                s.trim_start().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(
+                s.trim_start().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+        }
+        _ => Err(datafusion::common::DataFusionError::Execution(
+            "ltrim expects string argument".to_string(),
+        )),
+    }
+}
+
+/// Trims whitespace from the right side of a string (Spark's RTRIM function)
+pub fn spark_rtrim(args: &[ColumnarValue]) -> DataFusionResult<ColumnarValue> {
+    if args.len() != 1 {
+        return Err(datafusion::common::DataFusionError::Execution(
+            format!("rtrim expects 1 argument, got {}", args.len()),
+        ));
+    }
+
+    match &args[0] {
+        ColumnarValue::Array(array) => {
+            let result = trim_array(array, TrimType::Right)?;
+            Ok(ColumnarValue::Array(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                s.trim_end().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(
+                s.trim_end().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+        }
+        _ => Err(datafusion::common::DataFusionError::Execution(
+            "rtrim expects string argument".to_string(),
+        )),
+    }
+}
+
+/// Trims whitespace from both ends of a string (alias for trim, Spark's BTRIM 
function)
+pub fn spark_btrim(args: &[ColumnarValue]) -> DataFusionResult<ColumnarValue> {
+    spark_trim(args)
+}
+
+#[derive(Debug, Clone, Copy)]
+enum TrimType {
+    Left,
+    Right,
+    Both,
+}
+
+/// Generic function to trim string arrays
+fn trim_array(array: &ArrayRef, trim_type: TrimType) -> 
DataFusionResult<ArrayRef> {
+    let data_type = array.data_type();
+    
+    match data_type {
+        DataType::Utf8 => {
+            let string_array = array
+                .as_any()
+                .downcast_ref::<arrow::array::StringArray>()
+                .ok_or_else(|| {
+                    datafusion::common::DataFusionError::Execution(
+                        "Failed to downcast to StringArray".to_string(),
+                    )
+                })?;
+            Ok(Arc::new(trim_string_array(string_array, trim_type)))
+        }
+        DataType::LargeUtf8 => {
+            let string_array = array
+                .as_any()
+                .downcast_ref::<arrow::array::LargeStringArray>()
+                .ok_or_else(|| {
+                    datafusion::common::DataFusionError::Execution(
+                        "Failed to downcast to LargeStringArray".to_string(),
+                    )
+                })?;
+            Ok(Arc::new(trim_string_array(string_array, trim_type)))
+        }
+        _ => Err(datafusion::common::DataFusionError::Execution(format!(
+            "trim expects string type, got {:?}",
+            data_type
+        ))),
+    }
+}
+
+/// Optimized trim implementation for GenericStringArray
+fn trim_string_array<O: OffsetSizeTrait>(
+    array: &GenericStringArray<O>,
+    trim_type: TrimType,
+) -> GenericStringArray<O> {
+    // Fast path: Check if any strings actually need trimming
+    // If not, we can return a clone of the original array
+    let needs_trimming = (0..array.len()).any(|i| {
+        if array.is_null(i) {
+            false
+        } else {
+            let s = array.value(i);
+            match trim_type {
+                TrimType::Left => s.starts_with(|c: char| c.is_whitespace()),
+                TrimType::Right => s.ends_with(|c: char| c.is_whitespace()),
+                TrimType::Both => {
+                    s.starts_with(|c: char| c.is_whitespace())
+                        || s.ends_with(|c: char| c.is_whitespace())
+                }
+            }
+        }
+    });
+
+    if !needs_trimming {
+        // No trimming needed, return a clone of the input

Review Comment:
   The fast-path optimization iterates through the entire array to check if any 
string needs trimming, then potentially iterates through it again to perform 
the trimming. This double iteration may actually hurt performance for arrays 
where most strings need trimming, which is likely a common case.
   
   For example, if all strings in an array have leading/trailing whitespace (a 
typical scenario), this optimization will:
   1. Iterate through all strings to check if trimming is needed
   2. Then iterate through all strings again to perform the trimming
   
   This doubles the iteration cost. Consider removing this optimization or 
adding a short-circuit mechanism that stops checking after finding a certain 
number of strings that need trimming (e.g., if the first N strings need 
trimming, assume the rest might too).
   ```suggestion
       // Fast path: Check if any strings actually need trimming.
       // If not, we can return a clone of the original array.
       // To avoid a full double scan, we bound the pre-scan: once we've
       // seen enough strings that need trimming, we stop checking and go
       // straight to building the trimmed array.
       const MAX_FAST_PATH_CHECK: usize = 16;
   
       let mut needs_trimming = false;
       let mut fully_scanned = true;
   
       for i in 0..array.len() {
           if array.is_null(i) {
               continue;
           }
   
           let s = array.value(i);
           let string_needs_trimming = match trim_type {
               TrimType::Left => s.starts_with(|c: char| c.is_whitespace()),
               TrimType::Right => s.ends_with(|c: char| c.is_whitespace()),
               TrimType::Both => {
                   s.starts_with(|c: char| c.is_whitespace())
                       || s.ends_with(|c: char| c.is_whitespace())
               }
           };
   
           if string_needs_trimming {
               needs_trimming = true;
               if i + 1 >= MAX_FAST_PATH_CHECK {
                   // We've already seen enough evidence that trimming is 
needed.
                   // Avoid scanning the rest of the array just for the fast 
path.
                   fully_scanned = false;
                   break;
               }
           }
       }
   
       if !needs_trimming && fully_scanned {
           // No trimming needed for any element, return a clone of the input.
   ```



##########
native/spark-expr/src/string_funcs/trim.rs:
##########
@@ -0,0 +1,319 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+use datafusion::common::{Result as DataFusionResult, ScalarValue};
+use datafusion::logical_expr::ColumnarValue;
+use std::sync::Arc;
+
+/// Trims whitespace from both ends of a string (Spark's TRIM function)
+pub fn spark_trim(args: &[ColumnarValue]) -> DataFusionResult<ColumnarValue> {
+    if args.len() != 1 {
+        return Err(datafusion::common::DataFusionError::Execution(
+            format!("trim expects 1 argument, got {}", args.len()),
+        ));
+    }
+
+    match &args[0] {
+        ColumnarValue::Array(array) => {
+            let result = trim_array(array, TrimType::Both)?;
+            Ok(ColumnarValue::Array(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                s.trim().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(
+                s.trim().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+        }
+        _ => Err(datafusion::common::DataFusionError::Execution(
+            "trim expects string argument".to_string(),
+        )),
+    }
+}
+
+/// Trims whitespace from the left side of a string (Spark's LTRIM function)
+pub fn spark_ltrim(args: &[ColumnarValue]) -> DataFusionResult<ColumnarValue> {
+    if args.len() != 1 {
+        return Err(datafusion::common::DataFusionError::Execution(
+            format!("ltrim expects 1 argument, got {}", args.len()),
+        ));
+    }
+
+    match &args[0] {
+        ColumnarValue::Array(array) => {
+            let result = trim_array(array, TrimType::Left)?;
+            Ok(ColumnarValue::Array(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                s.trim_start().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(
+                s.trim_start().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+        }
+        _ => Err(datafusion::common::DataFusionError::Execution(
+            "ltrim expects string argument".to_string(),
+        )),
+    }
+}
+
+/// Trims whitespace from the right side of a string (Spark's RTRIM function)
+pub fn spark_rtrim(args: &[ColumnarValue]) -> DataFusionResult<ColumnarValue> {
+    if args.len() != 1 {
+        return Err(datafusion::common::DataFusionError::Execution(
+            format!("rtrim expects 1 argument, got {}", args.len()),
+        ));
+    }
+
+    match &args[0] {
+        ColumnarValue::Array(array) => {
+            let result = trim_array(array, TrimType::Right)?;
+            Ok(ColumnarValue::Array(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                s.trim_end().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s))) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(
+                s.trim_end().to_string(),
+            ))))
+        }
+        ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+        }
+        _ => Err(datafusion::common::DataFusionError::Execution(
+            "rtrim expects string argument".to_string(),
+        )),
+    }
+}
+
+/// Trims whitespace from both ends of a string (alias for trim, Spark's BTRIM 
function)
+pub fn spark_btrim(args: &[ColumnarValue]) -> DataFusionResult<ColumnarValue> {
+    spark_trim(args)
+}
+
+#[derive(Debug, Clone, Copy)]
+enum TrimType {
+    Left,
+    Right,
+    Both,
+}
+
+/// Generic function to trim string arrays
+fn trim_array(array: &ArrayRef, trim_type: TrimType) -> 
DataFusionResult<ArrayRef> {
+    let data_type = array.data_type();
+    
+    match data_type {
+        DataType::Utf8 => {
+            let string_array = array
+                .as_any()
+                .downcast_ref::<arrow::array::StringArray>()
+                .ok_or_else(|| {
+                    datafusion::common::DataFusionError::Execution(
+                        "Failed to downcast to StringArray".to_string(),
+                    )
+                })?;
+            Ok(Arc::new(trim_string_array(string_array, trim_type)))
+        }
+        DataType::LargeUtf8 => {
+            let string_array = array
+                .as_any()
+                .downcast_ref::<arrow::array::LargeStringArray>()
+                .ok_or_else(|| {
+                    datafusion::common::DataFusionError::Execution(
+                        "Failed to downcast to LargeStringArray".to_string(),
+                    )
+                })?;
+            Ok(Arc::new(trim_string_array(string_array, trim_type)))
+        }
+        _ => Err(datafusion::common::DataFusionError::Execution(format!(
+            "trim expects string type, got {:?}",
+            data_type
+        ))),
+    }
+}
+
+/// Optimized trim implementation for GenericStringArray
+fn trim_string_array<O: OffsetSizeTrait>(
+    array: &GenericStringArray<O>,
+    trim_type: TrimType,
+) -> GenericStringArray<O> {
+    // Fast path: Check if any strings actually need trimming
+    // If not, we can return a clone of the original array
+    let needs_trimming = (0..array.len()).any(|i| {
+        if array.is_null(i) {
+            false
+        } else {
+            let s = array.value(i);
+            match trim_type {
+                TrimType::Left => s.starts_with(|c: char| c.is_whitespace()),
+                TrimType::Right => s.ends_with(|c: char| c.is_whitespace()),
+                TrimType::Both => {
+                    s.starts_with(|c: char| c.is_whitespace())
+                        || s.ends_with(|c: char| c.is_whitespace())
+                }
+            }
+        }
+    });
+
+    if !needs_trimming {
+        // No trimming needed, return a clone of the input
+        return array.clone();
+    }
+
+    // Slow path: Build new array with trimmed strings
+    let mut builder = arrow::array::GenericStringBuilder::<O>::with_capacity(
+        array.len(),
+        array.get_buffer_memory_size(),
+    );
+
+    for i in 0..array.len() {
+        if array.is_null(i) {
+            builder.append_null();
+        } else {
+            let s = array.value(i);
+            let trimmed = match trim_type {
+                TrimType::Left => s.trim_start(),
+                TrimType::Right => s.trim_end(),
+                TrimType::Both => s.trim(),
+            };
+            builder.append_value(trimmed);
+        }
+    }
+
+    builder.finish()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::StringArray;
+
+    #[test]
+    fn test_trim() {
+        let input = StringArray::from(vec![
+            Some("  hello  "),
+            Some("world"),
+            Some("  spaces  "),
+            None,
+        ]);
+        let input_ref: ArrayRef = Arc::new(input);
+
+        let result = trim_array(&input_ref, TrimType::Both).unwrap();
+        let result_array = 
result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(result_array.value(0), "hello");
+        assert_eq!(result_array.value(1), "world");
+        assert_eq!(result_array.value(2), "spaces");
+        assert!(result_array.is_null(3));
+    }
+
+    #[test]
+    fn test_ltrim() {
+        let input = StringArray::from(vec![Some("  hello  "), Some("world  
")]);
+        let input_ref: ArrayRef = Arc::new(input);
+
+        let result = trim_array(&input_ref, TrimType::Left).unwrap();
+        let result_array = 
result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(result_array.value(0), "hello  ");
+        assert_eq!(result_array.value(1), "world  ");
+    }
+
+    #[test]
+    fn test_rtrim() {
+        let input = StringArray::from(vec![Some("  hello  "), Some("  
world")]);
+        let input_ref: ArrayRef = Arc::new(input);
+
+        let result = trim_array(&input_ref, TrimType::Right).unwrap();
+        let result_array = 
result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(result_array.value(0), "  hello");
+        assert_eq!(result_array.value(1), "  world");
+    }
+
+    #[test]
+    fn test_trim_no_whitespace_fast_path() {
+        // Test the fast path where no trimming is needed
+        let input = StringArray::from(vec![
+            Some("hello"),
+            Some("world"),
+            Some("no spaces"),
+            None,
+        ]);
+        let input_ref: ArrayRef = Arc::new(input.clone());
+
+        let result = trim_array(&input_ref, TrimType::Both).unwrap();
+        let result_array = 
result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        // Verify values are correct
+        assert_eq!(result_array.value(0), "hello");
+        assert_eq!(result_array.value(1), "world");
+        assert_eq!(result_array.value(2), "no spaces");
+        assert!(result_array.is_null(3));
+    }
+
+    #[test]
+    fn test_ltrim_no_whitespace() {
+        // Test ltrim with strings that have no leading whitespace
+        let input = StringArray::from(vec![Some("hello  "), Some("world")]);
+        let input_ref: ArrayRef = Arc::new(input);
+
+        let result = trim_array(&input_ref, TrimType::Left).unwrap();
+        let result_array = 
result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(result_array.value(0), "hello  ");
+        assert_eq!(result_array.value(1), "world");
+    }
+
+    #[test]
+    fn test_rtrim_no_whitespace() {
+        // Test rtrim with strings that have no trailing whitespace
+        let input = StringArray::from(vec![Some("  hello"), Some("world")]);
+        let input_ref: ArrayRef = Arc::new(input);
+
+        let result = trim_array(&input_ref, TrimType::Right).unwrap();
+        let result_array = 
result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(result_array.value(0), "  hello");
+        assert_eq!(result_array.value(1), "world");
+    }
+}

Review Comment:
   The test coverage is missing several important edge cases:
   1. Empty strings - test how empty string "" is handled (should remain empty)
   2. Strings with only whitespace - test strings like "   " (should become 
empty after trim)
   3. Different types of whitespace - test tabs, newlines, and other Unicode 
whitespace characters
   4. Large string arrays - test performance characteristics with larger data 
sets
   5. LargeUtf8 arrays - only Utf8 arrays are tested, but the code supports both
   6. Scalar value paths - the scalar handling branches in 
spark_trim/ltrim/rtrim are not tested
   
   Consider adding tests for these scenarios to ensure correctness across all 
supported input types and edge cases.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] perf: implement native Rust trim functions for better performance [datafusion-comet]

Reply via email to