alamb commented on code in PR #11941:
URL: https://github.com/apache/datafusion/pull/11941#discussion_r1713604384


##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -76,300 +87,450 @@ impl ScalarUDFImpl for LPadFunc {
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        match args[0].data_type() {
-            DataType::Utf8 => make_scalar_function(lpad::<i32>, vec![])(args),
-            DataType::LargeUtf8 => make_scalar_function(lpad::<i64>, 
vec![])(args),
-            other => exec_err!("Unsupported data type {other:?} for function 
lpad"),
-        }
+        make_scalar_function(lpad, vec![])(args)
     }
 }
 
-/// Extends the string to length 'length' by prepending the characters fill (a 
space by default). If the string is already longer than length then it is 
truncated (on the right).
+/// Extends the string to length 'length' by prepending the characters fill (a 
space by default).
+/// If the string is already longer than length then it is truncated (on the 
right).
 /// lpad('hi', 5, 'xy') = 'xyxhi'
-pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args.len() {
-        2 => {
-            let string_array = as_generic_string_array::<T>(&args[0])?;
-            let length_array = as_int64_array(&args[1])?;
-
-            let result = string_array
-                .iter()
-                .zip(length_array.iter())
-                .map(|(string, length)| match (string, length) {
-                    (Some(string), Some(length)) => {
-                        if length > i32::MAX as i64 {
-                            return exec_err!(
-                                "lpad requested length {length} too large"
-                            );
-                        }
+pub fn lpad(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() <= 1 || args.len() > 3 {
+        return exec_err!(
+            "lpad was called with {} arguments. It requires at least 2 and at 
most 3.",
+            args.len()
+        );
+    }
+
+    let length_array = as_int64_array(&args[1])?;
+
+    match args[0].data_type() {
+        Utf8 => match args.len() {
+            2 => lpad_impl::<&GenericStringArray<i32>, 
&GenericStringArray<i32>, i32>(
+                args[0].as_string::<i32>(),
+                length_array,
+                None,
+            ),
+            3 => lpad_with_replace::<&GenericStringArray<i32>, i32>(
+                args[0].as_string::<i32>(),
+                length_array,
+                &args[2],
+            ),
+            _ => unreachable!(),
+        },
+        LargeUtf8 => match args.len() {
+            2 => lpad_impl::<&GenericStringArray<i64>, 
&GenericStringArray<i64>, i64>(
+                args[0].as_string::<i64>(),
+                length_array,
+                None,
+            ),
+            3 => lpad_with_replace::<&GenericStringArray<i64>, i64>(
+                args[0].as_string::<i64>(),
+                length_array,
+                &args[2],
+            ),
+            _ => unreachable!(),
+        },
+        Utf8View => match args.len() {
+            2 => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, i32>(
+                args[0].as_string_view(),
+                length_array,
+                None,
+            ),
+            3 => lpad_with_replace::<&StringViewArray, i32>(
+                args[0].as_string_view(),
+                length_array,
+                &args[2],
+            ),
+            _ => unreachable!(),
+        },
+        other => {
+            exec_err!("Unsupported data type {other:?} for function lpad")
+        }
+    }
+}
 
-                        let length = if length < 0 { 0 } else { length as 
usize };
-                        if length == 0 {
-                            Ok(Some("".to_string()))
+fn lpad_with_replace<'a, V, T: OffsetSizeTrait>(
+    string_array: V,
+    length_array: &Int64Array,
+    fill_array: &'a ArrayRef,
+) -> Result<ArrayRef>
+where
+    V: StringArrayType<'a>,
+{
+    match fill_array.data_type() {
+        Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>(
+            string_array,
+            length_array,
+            Some(fill_array.as_string::<i32>()),
+        ),
+        LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>(
+            string_array,
+            length_array,
+            Some(fill_array.as_string::<i64>()),
+        ),
+        Utf8View => lpad_impl::<V, &StringViewArray, T>(
+            string_array,
+            length_array,
+            Some(fill_array.as_string_view()),
+        ),
+        other => {
+            exec_err!("Unsupported data type {other:?} for function lpad")
+        }
+    }
+}
+
+fn lpad_impl<'a, V, V2, T>(
+    string_array: V,
+    length_array: &Int64Array,
+    fill_array: Option<V2>,
+) -> Result<ArrayRef>
+where
+    V: StringArrayType<'a>,
+    V2: StringArrayType<'a>,
+    T: OffsetSizeTrait,
+{
+    if fill_array.is_none() {
+        let result = string_array
+            .iter()
+            .zip(length_array.iter())
+            .map(|(string, length)| match (string, length) {
+                (Some(string), Some(length)) => {
+                    if length > i32::MAX as i64 {
+                        return exec_err!("lpad requested length {length} too 
large");
+                    }
+
+                    let length = if length < 0 { 0 } else { length as usize };
+                    if length == 0 {
+                        Ok(Some("".to_string()))
+                    } else {
+                        let graphemes = 
string.graphemes(true).collect::<Vec<&str>>();
+                        if length < graphemes.len() {
+                            Ok(Some(graphemes[..length].concat()))
                         } else {
-                            let graphemes = 
string.graphemes(true).collect::<Vec<&str>>();
-                            if length < graphemes.len() {
-                                Ok(Some(graphemes[..length].concat()))
-                            } else {
-                                let mut s: String = " ".repeat(length - 
graphemes.len());
-                                s.push_str(string);
-                                Ok(Some(s))
-                            }
+                            let mut s: String = " ".repeat(length - 
graphemes.len());

Review Comment:
   This PR just follows the existing pattern so I don't think any changes are 
needed, but I think using a `GenericStringBuilder` would be much faster here 
(as it wouldn't allocate a string for each output array element)
   
   



##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -76,300 +87,450 @@ impl ScalarUDFImpl for LPadFunc {
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        match args[0].data_type() {
-            DataType::Utf8 => make_scalar_function(lpad::<i32>, vec![])(args),
-            DataType::LargeUtf8 => make_scalar_function(lpad::<i64>, 
vec![])(args),
-            other => exec_err!("Unsupported data type {other:?} for function 
lpad"),
-        }
+        make_scalar_function(lpad, vec![])(args)
     }
 }
 
-/// Extends the string to length 'length' by prepending the characters fill (a 
space by default). If the string is already longer than length then it is 
truncated (on the right).
+/// Extends the string to length 'length' by prepending the characters fill (a 
space by default).
+/// If the string is already longer than length then it is truncated (on the 
right).
 /// lpad('hi', 5, 'xy') = 'xyxhi'
-pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args.len() {
-        2 => {
-            let string_array = as_generic_string_array::<T>(&args[0])?;
-            let length_array = as_int64_array(&args[1])?;
-
-            let result = string_array
-                .iter()
-                .zip(length_array.iter())
-                .map(|(string, length)| match (string, length) {
-                    (Some(string), Some(length)) => {
-                        if length > i32::MAX as i64 {
-                            return exec_err!(
-                                "lpad requested length {length} too large"
-                            );
-                        }
+pub fn lpad(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() <= 1 || args.len() > 3 {
+        return exec_err!(
+            "lpad was called with {} arguments. It requires at least 2 and at 
most 3.",
+            args.len()
+        );
+    }
+
+    let length_array = as_int64_array(&args[1])?;
+
+    match args[0].data_type() {
+        Utf8 => match args.len() {
+            2 => lpad_impl::<&GenericStringArray<i32>, 
&GenericStringArray<i32>, i32>(
+                args[0].as_string::<i32>(),
+                length_array,
+                None,
+            ),
+            3 => lpad_with_replace::<&GenericStringArray<i32>, i32>(
+                args[0].as_string::<i32>(),
+                length_array,
+                &args[2],
+            ),
+            _ => unreachable!(),
+        },
+        LargeUtf8 => match args.len() {
+            2 => lpad_impl::<&GenericStringArray<i64>, 
&GenericStringArray<i64>, i64>(
+                args[0].as_string::<i64>(),
+                length_array,
+                None,
+            ),
+            3 => lpad_with_replace::<&GenericStringArray<i64>, i64>(
+                args[0].as_string::<i64>(),
+                length_array,
+                &args[2],
+            ),
+            _ => unreachable!(),
+        },
+        Utf8View => match args.len() {
+            2 => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, i32>(
+                args[0].as_string_view(),
+                length_array,
+                None,
+            ),
+            3 => lpad_with_replace::<&StringViewArray, i32>(
+                args[0].as_string_view(),
+                length_array,
+                &args[2],
+            ),
+            _ => unreachable!(),
+        },
+        other => {
+            exec_err!("Unsupported data type {other:?} for function lpad")
+        }
+    }
+}
 
-                        let length = if length < 0 { 0 } else { length as 
usize };
-                        if length == 0 {
-                            Ok(Some("".to_string()))
+fn lpad_with_replace<'a, V, T: OffsetSizeTrait>(
+    string_array: V,
+    length_array: &Int64Array,
+    fill_array: &'a ArrayRef,
+) -> Result<ArrayRef>
+where
+    V: StringArrayType<'a>,
+{
+    match fill_array.data_type() {
+        Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>(
+            string_array,
+            length_array,
+            Some(fill_array.as_string::<i32>()),
+        ),
+        LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>(
+            string_array,
+            length_array,
+            Some(fill_array.as_string::<i64>()),
+        ),
+        Utf8View => lpad_impl::<V, &StringViewArray, T>(
+            string_array,
+            length_array,
+            Some(fill_array.as_string_view()),
+        ),
+        other => {
+            exec_err!("Unsupported data type {other:?} for function lpad")
+        }
+    }
+}
+
+fn lpad_impl<'a, V, V2, T>(
+    string_array: V,
+    length_array: &Int64Array,
+    fill_array: Option<V2>,
+) -> Result<ArrayRef>
+where
+    V: StringArrayType<'a>,
+    V2: StringArrayType<'a>,
+    T: OffsetSizeTrait,
+{
+    if fill_array.is_none() {
+        let result = string_array
+            .iter()
+            .zip(length_array.iter())
+            .map(|(string, length)| match (string, length) {
+                (Some(string), Some(length)) => {
+                    if length > i32::MAX as i64 {
+                        return exec_err!("lpad requested length {length} too 
large");
+                    }
+
+                    let length = if length < 0 { 0 } else { length as usize };
+                    if length == 0 {
+                        Ok(Some("".to_string()))
+                    } else {
+                        let graphemes = 
string.graphemes(true).collect::<Vec<&str>>();
+                        if length < graphemes.len() {
+                            Ok(Some(graphemes[..length].concat()))
                         } else {
-                            let graphemes = 
string.graphemes(true).collect::<Vec<&str>>();
-                            if length < graphemes.len() {
-                                Ok(Some(graphemes[..length].concat()))
-                            } else {
-                                let mut s: String = " ".repeat(length - 
graphemes.len());
-                                s.push_str(string);
-                                Ok(Some(s))
-                            }
+                            let mut s: String = " ".repeat(length - 
graphemes.len());
+                            s.push_str(string);
+                            Ok(Some(s))
                         }
                     }
-                    _ => Ok(None),
-                })
-                .collect::<Result<GenericStringArray<T>>>()?;
+                }
+                _ => Ok(None),
+            })
+            .collect::<Result<GenericStringArray<T>>>()?;
 
-            Ok(Arc::new(result) as ArrayRef)
-        }
-        3 => {
-            let string_array = as_generic_string_array::<T>(&args[0])?;
-            let length_array = as_int64_array(&args[1])?;
-            let fill_array = as_generic_string_array::<T>(&args[2])?;
-
-            let result = string_array
-                .iter()
-                .zip(length_array.iter())
-                .zip(fill_array.iter())
-                .map(|((string, length), fill)| match (string, length, fill) {
-                    (Some(string), Some(length), Some(fill)) => {
-                        if length > i32::MAX as i64 {
-                            return exec_err!(
-                                "lpad requested length {length} too large"
-                            );
-                        }
+        Ok(Arc::new(result) as ArrayRef)
+    } else {
+        let result = string_array
+            .iter()
+            .zip(length_array.iter())
+            .zip(fill_array.unwrap().iter())
+            .map(|((string, length), fill)| match (string, length, fill) {
+                (Some(string), Some(length), Some(fill)) => {
+                    if length > i32::MAX as i64 {
+                        return exec_err!("lpad requested length {length} too 
large");
+                    }
+
+                    let length = if length < 0 { 0 } else { length as usize };
+                    if length == 0 {
+                        Ok(Some("".to_string()))
+                    } else {
+                        let graphemes = 
string.graphemes(true).collect::<Vec<&str>>();
+                        let fill_chars = fill.chars().collect::<Vec<char>>();
 
-                        let length = if length < 0 { 0 } else { length as 
usize };
-                        if length == 0 {
-                            Ok(Some("".to_string()))
+                        if length < graphemes.len() {

Review Comment:
   likewise here I think GenericStringBuilder would likely be faster



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to