martin-g commented on code in PR #20352:
URL: https://github.com/apache/datafusion/pull/20352#discussion_r2827704011
##########
datafusion/functions/src/unicode/initcap.rs:
##########
@@ -329,4 +385,52 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_initcap_ascii_array() -> Result<()> {
+ let array = StringArray::from(vec![
+ Some("hello world"),
+ None,
+ Some("foo-bar_baz/baX"),
+ Some(""),
+ Some("123 abc 456DEF"),
+ Some("ALL CAPS"),
+ Some("already correct"),
+ ]);
+ let args: Vec<ArrayRef> = vec![Arc::new(array)];
+ let result = super::initcap::<i32>(&args)?;
+ let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+
+ assert_eq!(result.len(), 7);
+ assert_eq!(result.value(0), "Hello World");
+ assert!(result.is_null(1));
+ assert_eq!(result.value(2), "Foo-Bar_Baz/Bax");
+ assert_eq!(result.value(3), "");
+ assert_eq!(result.value(4), "123 Abc 456def");
+ assert_eq!(result.value(5), "All Caps");
+ assert_eq!(result.value(6), "Already Correct");
+ Ok(())
+ }
+
+ /// Test that initcap works correctly on a sliced ASCII StringArray.
+ #[test]
+ fn test_initcap_sliced_ascii_array() -> Result<()> {
+ let array = StringArray::from(vec![
+ Some("hello world"),
+ Some("foo bar"),
+ Some("baz qux"),
+ ]);
+ // Slice to get only the last two elements. The resulting array's
+ // offsets are [11, 18, 25] (non-zero start), but value_data still
+ // contains the full original buffer.
+ let sliced = array.slice(1, 2);
+ let args: Vec<ArrayRef> = vec![Arc::new(sliced)];
+ let result = super::initcap::<i32>(&args)?;
+ let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+
+ assert_eq!(result.len(), 2);
+ assert_eq!(result.value(0), "Foo Bar");
+ assert_eq!(result.value(1), "Baz Qux");
+ Ok(())
+ }
Review Comment:
Both new unit tests use StringArray (Utf8). It would be good to have a test
for LargeStringArray (LargeUtf8) too. Either change of the existing tests or
add a new one.
##########
datafusion/functions/src/unicode/initcap.rs:
##########
@@ -198,13 +250,16 @@ fn initcap_string(input: &str, container: &mut String) {
let mut prev_is_alphanumeric = false;
if input.is_ascii() {
- for c in input.chars() {
+ container.reserve(input.len());
+ // SAFETY: each byte is ASCII, so the result is valid UTF-8.
+ let out = unsafe { container.as_mut_vec() };
+ for &b in input.as_bytes() {
if prev_is_alphanumeric {
- container.push(c.to_ascii_lowercase());
+ out.push(b.to_ascii_lowercase());
} else {
- container.push(c.to_ascii_uppercase());
- };
- prev_is_alphanumeric = c.is_ascii_alphanumeric();
+ out.push(b.to_ascii_uppercase());
+ }
+ prev_is_alphanumeric = b.is_ascii_alphanumeric();
}
Review Comment:
The String::len should be updated after the "manual" update of the inner vec.
```suggestion
}
unsafe {
container.set_len(input.len());
}
```
##########
datafusion/functions/src/unicode/initcap.rs:
##########
@@ -148,8 +150,8 @@ impl ScalarUDFImpl for InitcapFunc {
}
}
-/// Converts the first letter of each word to upper case and the rest to lower
-/// case. Words are sequences of alphanumeric characters separated by
+/// Converts the first letter of each word to uppercase and the rest to
+/// lowercase. Words are sequences of alphanumeric characters separated by
Review Comment:
```suggestion
/// lowercase. Words are sequences of alphanumeric characters separated by
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]