Omega359 commented on code in PR #11941:
URL: https://github.com/apache/datafusion/pull/11941#discussion_r1714624067


##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -76,300 +87,450 @@ impl ScalarUDFImpl for LPadFunc {
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        match args[0].data_type() {
-            DataType::Utf8 => make_scalar_function(lpad::<i32>, vec![])(args),
-            DataType::LargeUtf8 => make_scalar_function(lpad::<i64>, 
vec![])(args),
-            other => exec_err!("Unsupported data type {other:?} for function 
lpad"),
-        }
+        make_scalar_function(lpad, vec![])(args)
     }
 }
 
-/// Extends the string to length 'length' by prepending the characters fill (a 
space by default). If the string is already longer than length then it is 
truncated (on the right).
+/// Extends the string to length 'length' by prepending the characters fill (a 
space by default).
+/// If the string is already longer than length then it is truncated (on the 
right).
 /// lpad('hi', 5, 'xy') = 'xyxhi'
-pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args.len() {
-        2 => {
-            let string_array = as_generic_string_array::<T>(&args[0])?;
-            let length_array = as_int64_array(&args[1])?;
-
-            let result = string_array
-                .iter()
-                .zip(length_array.iter())
-                .map(|(string, length)| match (string, length) {
-                    (Some(string), Some(length)) => {
-                        if length > i32::MAX as i64 {
-                            return exec_err!(
-                                "lpad requested length {length} too large"
-                            );
-                        }
+pub fn lpad(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() <= 1 || args.len() > 3 {
+        return exec_err!(
+            "lpad was called with {} arguments. It requires at least 2 and at 
most 3.",
+            args.len()
+        );
+    }
+
+    let length_array = as_int64_array(&args[1])?;
+
+    match args[0].data_type() {
+        Utf8 => match args.len() {
+            2 => lpad_impl::<&GenericStringArray<i32>, 
&GenericStringArray<i32>, i32>(
+                args[0].as_string::<i32>(),
+                length_array,
+                None,
+            ),
+            3 => lpad_with_replace::<&GenericStringArray<i32>, i32>(
+                args[0].as_string::<i32>(),
+                length_array,
+                &args[2],
+            ),
+            _ => unreachable!(),
+        },
+        LargeUtf8 => match args.len() {
+            2 => lpad_impl::<&GenericStringArray<i64>, 
&GenericStringArray<i64>, i64>(
+                args[0].as_string::<i64>(),
+                length_array,
+                None,
+            ),
+            3 => lpad_with_replace::<&GenericStringArray<i64>, i64>(
+                args[0].as_string::<i64>(),
+                length_array,
+                &args[2],
+            ),
+            _ => unreachable!(),
+        },
+        Utf8View => match args.len() {
+            2 => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, i32>(
+                args[0].as_string_view(),
+                length_array,
+                None,
+            ),
+            3 => lpad_with_replace::<&StringViewArray, i32>(
+                args[0].as_string_view(),
+                length_array,
+                &args[2],
+            ),
+            _ => unreachable!(),
+        },
+        other => {
+            exec_err!("Unsupported data type {other:?} for function lpad")
+        }
+    }
+}
 
-                        let length = if length < 0 { 0 } else { length as 
usize };
-                        if length == 0 {
-                            Ok(Some("".to_string()))
+fn lpad_with_replace<'a, V, T: OffsetSizeTrait>(
+    string_array: V,
+    length_array: &Int64Array,
+    fill_array: &'a ArrayRef,
+) -> Result<ArrayRef>
+where
+    V: StringArrayType<'a>,
+{
+    match fill_array.data_type() {
+        Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>(
+            string_array,
+            length_array,
+            Some(fill_array.as_string::<i32>()),
+        ),
+        LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>(
+            string_array,
+            length_array,
+            Some(fill_array.as_string::<i64>()),
+        ),
+        Utf8View => lpad_impl::<V, &StringViewArray, T>(
+            string_array,
+            length_array,
+            Some(fill_array.as_string_view()),
+        ),
+        other => {
+            exec_err!("Unsupported data type {other:?} for function lpad")
+        }
+    }
+}
+
+fn lpad_impl<'a, V, V2, T>(
+    string_array: V,
+    length_array: &Int64Array,
+    fill_array: Option<V2>,
+) -> Result<ArrayRef>
+where
+    V: StringArrayType<'a>,
+    V2: StringArrayType<'a>,
+    T: OffsetSizeTrait,
+{
+    if fill_array.is_none() {
+        let result = string_array
+            .iter()
+            .zip(length_array.iter())
+            .map(|(string, length)| match (string, length) {
+                (Some(string), Some(length)) => {
+                    if length > i32::MAX as i64 {
+                        return exec_err!("lpad requested length {length} too 
large");
+                    }
+
+                    let length = if length < 0 { 0 } else { length as usize };
+                    if length == 0 {
+                        Ok(Some("".to_string()))
+                    } else {
+                        let graphemes = 
string.graphemes(true).collect::<Vec<&str>>();
+                        if length < graphemes.len() {
+                            Ok(Some(graphemes[..length].concat()))
                         } else {
-                            let graphemes = 
string.graphemes(true).collect::<Vec<&str>>();
-                            if length < graphemes.len() {
-                                Ok(Some(graphemes[..length].concat()))
-                            } else {
-                                let mut s: String = " ".repeat(length - 
graphemes.len());
-                                s.push_str(string);
-                                Ok(Some(s))
-                            }
+                            let mut s: String = " ".repeat(length - 
graphemes.len());

Review Comment:
   I did a test locally with using `GenericStringBuilder` vs what is in this PR 
and the differences are within +/- 10% for 1024 and 2048 element arrays for 
each of the 3 string types. Worth noting though is that timing between runs on 
my laptop is not really stable at all (+/- 10% or so between runs of the exact 
same code).
   
   Interestingly the utf8view seems to be consistently slightly slower than 
just plain utf8 at least in this function.
   ```
   lpad function/utf8 type/2048
                           time:   [715.78 µs 718.43 µs 721.89 µs]
                           
   lpad function/largeutf8 type/2048
                           time:   [745.44 µs 759.62 µs 779.84 µs]
                          
   lpad function/stringview type/2048
                           time:   [734.25 µs 751.13 µs 773.67 µs]
                          
     ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to