Re: [PR] perf: Optimize initcap() [datafusion]

via GitHub Thu, 19 Feb 2026 04:54:59 -0800


martin-g commented on code in PR #20352:
URL: https://github.com/apache/datafusion/pull/20352#discussion_r2827704011



##########
datafusion/functions/src/unicode/initcap.rs:
##########
@@ -329,4 +385,52 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_initcap_ascii_array() -> Result<()> {
+        let array = StringArray::from(vec![
+            Some("hello world"),
+            None,
+            Some("foo-bar_baz/baX"),
+            Some(""),
+            Some("123 abc 456DEF"),
+            Some("ALL CAPS"),
+            Some("already correct"),
+        ]);
+        let args: Vec<ArrayRef> = vec![Arc::new(array)];
+        let result = super::initcap::<i32>(&args)?;
+        let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(result.len(), 7);
+        assert_eq!(result.value(0), "Hello World");
+        assert!(result.is_null(1));
+        assert_eq!(result.value(2), "Foo-Bar_Baz/Bax");
+        assert_eq!(result.value(3), "");
+        assert_eq!(result.value(4), "123 Abc 456def");
+        assert_eq!(result.value(5), "All Caps");
+        assert_eq!(result.value(6), "Already Correct");
+        Ok(())
+    }
+
+    /// Test that initcap works correctly on a sliced ASCII StringArray.
+    #[test]
+    fn test_initcap_sliced_ascii_array() -> Result<()> {
+        let array = StringArray::from(vec![
+            Some("hello world"),
+            Some("foo bar"),
+            Some("baz qux"),
+        ]);
+        // Slice to get only the last two elements. The resulting array's
+        // offsets are [11, 18, 25] (non-zero start), but value_data still
+        // contains the full original buffer.
+        let sliced = array.slice(1, 2);
+        let args: Vec<ArrayRef> = vec![Arc::new(sliced)];
+        let result = super::initcap::<i32>(&args)?;
+        let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(result.len(), 2);
+        assert_eq!(result.value(0), "Foo Bar");
+        assert_eq!(result.value(1), "Baz Qux");
+        Ok(())
+    }

Review Comment:
   Both new unit tests use StringArray (Utf8). It would be good to have a test 
for LargeStringArray (LargeUtf8) too. Either change of the existing tests or 
add a new one.



##########
datafusion/functions/src/unicode/initcap.rs:
##########
@@ -198,13 +250,16 @@ fn initcap_string(input: &str, container: &mut String) {
     let mut prev_is_alphanumeric = false;
 
     if input.is_ascii() {
-        for c in input.chars() {
+        container.reserve(input.len());
+        // SAFETY: each byte is ASCII, so the result is valid UTF-8.
+        let out = unsafe { container.as_mut_vec() };
+        for &b in input.as_bytes() {
             if prev_is_alphanumeric {
-                container.push(c.to_ascii_lowercase());
+                out.push(b.to_ascii_lowercase());
             } else {
-                container.push(c.to_ascii_uppercase());
-            };
-            prev_is_alphanumeric = c.is_ascii_alphanumeric();
+                out.push(b.to_ascii_uppercase());
+            }
+            prev_is_alphanumeric = b.is_ascii_alphanumeric();
         }

Review Comment:
   The String::len should be updated after the "manual" update of the inner vec.
   ```suggestion
           }
           unsafe {
               container.set_len(input.len());
           }
   ```



##########
datafusion/functions/src/unicode/initcap.rs:
##########
@@ -148,8 +150,8 @@ impl ScalarUDFImpl for InitcapFunc {
     }
 }
 
-/// Converts the first letter of each word to upper case and the rest to lower
-/// case. Words are sequences of alphanumeric characters separated by
+/// Converts the first letter of each word to uppercase and the rest to
+/// lowercase.  Words are sequences of alphanumeric characters separated by

Review Comment:
   ```suggestion
   /// lowercase. Words are sequences of alphanumeric characters separated by
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] perf: Optimize initcap() [datafusion]

Reply via email to