neilconway commented on code in PR #20657:
URL: https://github.com/apache/datafusion/pull/20657#discussion_r2961239320


##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -129,8 +180,126 @@ impl ScalarUDFImpl for LPadFunc {
     }
 }
 
-/// Extends the string to length 'length' by prepending the characters fill (a space by default).
-/// If the string is already longer than length then it is truncated (on the right).
+use super::common::{try_as_scalar_i64, try_as_scalar_str};
+
+/// Optimized lpad for constant target_len and fill arguments.
+fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
+    string_array: V,
+    target_len: usize,
+    fill: &str,
+) -> Result<ArrayRef> {
+    if string_array.is_ascii() && fill.is_ascii() {
+        lpad_scalar_ascii::<V, T>(string_array, target_len, fill)
+    } else {
+        lpad_scalar_unicode::<V, T>(string_array, target_len, fill)
+    }
+}
+
+fn lpad_scalar_ascii<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
+    string_array: V,
+    target_len: usize,
+    fill: &str,
+) -> Result<ArrayRef> {
+    // With a scalar `target_len` and `fill`, we can precompute a padding
+    // buffer of `target_len` fill characters repeated cyclically.
+    let padding_buf = if !fill.is_empty() {
+        let mut buf = String::with_capacity(target_len);
+        while buf.len() < target_len {
+            let remaining = target_len - buf.len();
+            if remaining >= fill.len() {
+                buf.push_str(fill);
+            } else {
+                buf.push_str(&fill[..remaining]);
+            }
+        }
+        buf
+    } else {
+        String::new()
+    };
+
+    // Each output row is exactly `target_len` ASCII bytes (padding + string).
+    let data_capacity = string_array.len().saturating_mul(target_len);
+    let mut builder =
+        GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
+
+    for maybe_string in string_array.iter() {
+        match maybe_string {
+            Some(string) => {
+                let str_len = string.len();
+                if target_len <= str_len {
+                    builder.append_value(&string[..target_len]);
+                } else if fill.is_empty() {
+                    builder.append_value(string);
+                } else {
+                    let pad_needed = target_len - str_len;
+                    builder.write_str(&padding_buf[..pad_needed])?;
+                    builder.append_value(string);
+                }
+            }
+            None => builder.append_null(),
+        }
+    }
+
+    Ok(Arc::new(builder.finish()) as ArrayRef)
+}
+
+fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
+    string_array: V,
+    target_len: usize,
+    fill: &str,
+) -> Result<ArrayRef> {
+    let fill_chars: Vec<char> = fill.chars().collect();
+
+    // With a scalar `target_len` and `fill`, we can precompute a padding buffer
+    // of `target_len` fill characters repeated cyclically. Because Unicode
+    // characters are variable-width, we build a byte-offset table to map from
+    // character count to the corresponding byte position in the padding buffer.
+    let (padding_buf, char_byte_offsets) = if !fill_chars.is_empty() {
+        let mut buf = String::new();
+        let mut offsets = Vec::with_capacity(target_len + 1);
+        offsets.push(0usize);
+        for i in 0..target_len {
+            buf.push(fill_chars[i % fill_chars.len()]);
+            offsets.push(buf.len());
+        }
+        (buf, offsets)
+    } else {
+        (String::new(), vec![0])
+    };
+
+    // Each output row is `target_len` chars; multiply by 4 (max UTF-8 bytes
+    // per char) for an upper bound in bytes.
+    let data_capacity = string_array.len().saturating_mul(target_len * 4);

Review Comment:
   `target_len` is <= 16KB, so `* 4` can't overflow. Switching to 
`saturating_mul` would be more defensive but also misleading; I'm inclined to 
leave it as-is.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to