neilconway commented on code in PR #20657:
URL: https://github.com/apache/datafusion/pull/20657#discussion_r2961416940
##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -129,8 +180,126 @@ impl ScalarUDFImpl for LPadFunc {
}
}
-/// Extends the string to length 'length' by prepending the characters fill (a
space by default).
-/// If the string is already longer than length then it is truncated (on the
right).
+use super::common::{try_as_scalar_i64, try_as_scalar_str};
+
+/// Optimized lpad for constant target_len and fill arguments.
+fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
+ string_array: V,
+ target_len: usize,
+ fill: &str,
+) -> Result<ArrayRef> {
+ if string_array.is_ascii() && fill.is_ascii() {
+ lpad_scalar_ascii::<V, T>(string_array, target_len, fill)
+ } else {
+ lpad_scalar_unicode::<V, T>(string_array, target_len, fill)
+ }
+}
+
+fn lpad_scalar_ascii<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
+ string_array: V,
+ target_len: usize,
+ fill: &str,
+) -> Result<ArrayRef> {
+ // With a scalar `target_len` and `fill`, we can precompute a padding
+ // buffer of `target_len` fill characters repeated cyclically.
+ let padding_buf = if !fill.is_empty() {
+ let mut buf = String::with_capacity(target_len);
+ while buf.len() < target_len {
+ let remaining = target_len - buf.len();
+ if remaining >= fill.len() {
+ buf.push_str(fill);
+ } else {
+ buf.push_str(&fill[..remaining]);
+ }
+ }
+ buf
+ } else {
+ String::new()
+ };
+
+ // Each output row is exactly `target_len` ASCII bytes (padding + string).
+ let data_capacity = string_array.len().saturating_mul(target_len);
+ let mut builder =
+ GenericStringBuilder::<T>::with_capacity(string_array.len(),
data_capacity);
+
+ for maybe_string in string_array.iter() {
+ match maybe_string {
+ Some(string) => {
+ let str_len = string.len();
+ if target_len <= str_len {
+ builder.append_value(&string[..target_len]);
+ } else if fill.is_empty() {
+ builder.append_value(string);
+ } else {
+ let pad_needed = target_len - str_len;
+ builder.write_str(&padding_buf[..pad_needed])?;
+ builder.append_value(string);
+ }
+ }
+ None => builder.append_null(),
+ }
+ }
+
+ Ok(Arc::new(builder.finish()) as ArrayRef)
+}
+
+fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
+ string_array: V,
+ target_len: usize,
+ fill: &str,
+) -> Result<ArrayRef> {
+ let fill_chars: Vec<char> = fill.chars().collect();
Review Comment:
Interesting point. Looking into this, most SQL systems use only codepoints,
not graphemes. Indeed, most places in DataFusion were switched to use
codepoints rather than graphemes in #3054. That PR didn't change
lpad/rpad/translate to use codepoints, but I can't see a good reason why not;
the Postgres implementation of lpad/rpad uses codepoints, for example.
I filed #21060 to switch `lpad`, `rpad`, and `translate` to use codepoints,
but I think that's out of scope for this PR.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]