alamb commented on code in PR #11941: URL: https://github.com/apache/datafusion/pull/11941#discussion_r1713604384
########## datafusion/functions/src/unicode/lpad.rs: ########## @@ -76,300 +87,450 @@ impl ScalarUDFImpl for LPadFunc { } fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { - match args[0].data_type() { - DataType::Utf8 => make_scalar_function(lpad::<i32>, vec![])(args), - DataType::LargeUtf8 => make_scalar_function(lpad::<i64>, vec![])(args), - other => exec_err!("Unsupported data type {other:?} for function lpad"), - } + make_scalar_function(lpad, vec![])(args) } } -/// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). +/// Extends the string to length 'length' by prepending the characters fill (a space by default). +/// If the string is already longer than length then it is truncated (on the right). /// lpad('hi', 5, 'xy') = 'xyxhi' -pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { - match args.len() { - 2 => { - let string_array = as_generic_string_array::<T>(&args[0])?; - let length_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .map(|(string, length)| match (string, length) { - (Some(string), Some(length)) => { - if length > i32::MAX as i64 { - return exec_err!( - "lpad requested length {length} too large" - ); - } +pub fn lpad(args: &[ArrayRef]) -> Result<ArrayRef> { + if args.len() <= 1 || args.len() > 3 { + return exec_err!( + "lpad was called with {} arguments. It requires at least 2 and at most 3.", + args.len() + ); + } + + let length_array = as_int64_array(&args[1])?; + + match args[0].data_type() { + Utf8 => match args.len() { + 2 => lpad_impl::<&GenericStringArray<i32>, &GenericStringArray<i32>, i32>( + args[0].as_string::<i32>(), + length_array, + None, + ), + 3 => lpad_with_replace::<&GenericStringArray<i32>, i32>( + args[0].as_string::<i32>(), + length_array, + &args[2], + ), + _ => unreachable!(), + }, + LargeUtf8 => match args.len() { + 2 => lpad_impl::<&GenericStringArray<i64>, &GenericStringArray<i64>, i64>( + args[0].as_string::<i64>(), + length_array, + None, + ), + 3 => lpad_with_replace::<&GenericStringArray<i64>, i64>( + args[0].as_string::<i64>(), + length_array, + &args[2], + ), + _ => unreachable!(), + }, + Utf8View => match args.len() { + 2 => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, i32>( + args[0].as_string_view(), + length_array, + None, + ), + 3 => lpad_with_replace::<&StringViewArray, i32>( + args[0].as_string_view(), + length_array, + &args[2], + ), + _ => unreachable!(), + }, + other => { + exec_err!("Unsupported data type {other:?} for function lpad") + } + } +} - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) +fn lpad_with_replace<'a, V, T: OffsetSizeTrait>( + string_array: V, + length_array: &Int64Array, + fill_array: &'a ArrayRef, +) -> Result<ArrayRef> +where + V: StringArrayType<'a>, +{ + match fill_array.data_type() { + Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>( + string_array, + length_array, + Some(fill_array.as_string::<i32>()), + ), + LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>( + string_array, + length_array, + Some(fill_array.as_string::<i64>()), + ), + Utf8View => lpad_impl::<V, &StringViewArray, T>( + string_array, + length_array, + Some(fill_array.as_string_view()), + ), + other => { + exec_err!("Unsupported data type {other:?} for function lpad") + } + } +} + +fn lpad_impl<'a, V, V2, T>( + string_array: V, + length_array: &Int64Array, + fill_array: Option<V2>, +) -> Result<ArrayRef> +where + V: StringArrayType<'a>, + V2: StringArrayType<'a>, + T: OffsetSizeTrait, +{ + if fill_array.is_none() { + let result = string_array + .iter() + .zip(length_array.iter()) + .map(|(string, length)| match (string, length) { + (Some(string), Some(length)) => { + if length > i32::MAX as i64 { + return exec_err!("lpad requested length {length} too large"); + } + + let length = if length < 0 { 0 } else { length as usize }; + if length == 0 { + Ok(Some("".to_string())) + } else { + let graphemes = string.graphemes(true).collect::<Vec<&str>>(); + if length < graphemes.len() { + Ok(Some(graphemes[..length].concat())) } else { - let graphemes = string.graphemes(true).collect::<Vec<&str>>(); - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else { - let mut s: String = " ".repeat(length - graphemes.len()); - s.push_str(string); - Ok(Some(s)) - } + let mut s: String = " ".repeat(length - graphemes.len()); Review Comment: This PR just follows the existing pattern so I don't think any changes are needed, but I think using a `GenericStringBuilder` would be much faster here (as it wouldn't allocate a string for each output array element) ########## datafusion/functions/src/unicode/lpad.rs: ########## @@ -76,300 +87,450 @@ impl ScalarUDFImpl for LPadFunc { } fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { - match args[0].data_type() { - DataType::Utf8 => make_scalar_function(lpad::<i32>, vec![])(args), - DataType::LargeUtf8 => make_scalar_function(lpad::<i64>, vec![])(args), - other => exec_err!("Unsupported data type {other:?} for function lpad"), - } + make_scalar_function(lpad, vec![])(args) } } -/// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). +/// Extends the string to length 'length' by prepending the characters fill (a space by default). +/// If the string is already longer than length then it is truncated (on the right). /// lpad('hi', 5, 'xy') = 'xyxhi' -pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { - match args.len() { - 2 => { - let string_array = as_generic_string_array::<T>(&args[0])?; - let length_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .map(|(string, length)| match (string, length) { - (Some(string), Some(length)) => { - if length > i32::MAX as i64 { - return exec_err!( - "lpad requested length {length} too large" - ); - } +pub fn lpad(args: &[ArrayRef]) -> Result<ArrayRef> { + if args.len() <= 1 || args.len() > 3 { + return exec_err!( + "lpad was called with {} arguments. It requires at least 2 and at most 3.", + args.len() + ); + } + + let length_array = as_int64_array(&args[1])?; + + match args[0].data_type() { + Utf8 => match args.len() { + 2 => lpad_impl::<&GenericStringArray<i32>, &GenericStringArray<i32>, i32>( + args[0].as_string::<i32>(), + length_array, + None, + ), + 3 => lpad_with_replace::<&GenericStringArray<i32>, i32>( + args[0].as_string::<i32>(), + length_array, + &args[2], + ), + _ => unreachable!(), + }, + LargeUtf8 => match args.len() { + 2 => lpad_impl::<&GenericStringArray<i64>, &GenericStringArray<i64>, i64>( + args[0].as_string::<i64>(), + length_array, + None, + ), + 3 => lpad_with_replace::<&GenericStringArray<i64>, i64>( + args[0].as_string::<i64>(), + length_array, + &args[2], + ), + _ => unreachable!(), + }, + Utf8View => match args.len() { + 2 => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, i32>( + args[0].as_string_view(), + length_array, + None, + ), + 3 => lpad_with_replace::<&StringViewArray, i32>( + args[0].as_string_view(), + length_array, + &args[2], + ), + _ => unreachable!(), + }, + other => { + exec_err!("Unsupported data type {other:?} for function lpad") + } + } +} - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) +fn lpad_with_replace<'a, V, T: OffsetSizeTrait>( + string_array: V, + length_array: &Int64Array, + fill_array: &'a ArrayRef, +) -> Result<ArrayRef> +where + V: StringArrayType<'a>, +{ + match fill_array.data_type() { + Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>( + string_array, + length_array, + Some(fill_array.as_string::<i32>()), + ), + LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>( + string_array, + length_array, + Some(fill_array.as_string::<i64>()), + ), + Utf8View => lpad_impl::<V, &StringViewArray, T>( + string_array, + length_array, + Some(fill_array.as_string_view()), + ), + other => { + exec_err!("Unsupported data type {other:?} for function lpad") + } + } +} + +fn lpad_impl<'a, V, V2, T>( + string_array: V, + length_array: &Int64Array, + fill_array: Option<V2>, +) -> Result<ArrayRef> +where + V: StringArrayType<'a>, + V2: StringArrayType<'a>, + T: OffsetSizeTrait, +{ + if fill_array.is_none() { + let result = string_array + .iter() + .zip(length_array.iter()) + .map(|(string, length)| match (string, length) { + (Some(string), Some(length)) => { + if length > i32::MAX as i64 { + return exec_err!("lpad requested length {length} too large"); + } + + let length = if length < 0 { 0 } else { length as usize }; + if length == 0 { + Ok(Some("".to_string())) + } else { + let graphemes = string.graphemes(true).collect::<Vec<&str>>(); + if length < graphemes.len() { + Ok(Some(graphemes[..length].concat())) } else { - let graphemes = string.graphemes(true).collect::<Vec<&str>>(); - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else { - let mut s: String = " ".repeat(length - graphemes.len()); - s.push_str(string); - Ok(Some(s)) - } + let mut s: String = " ".repeat(length - graphemes.len()); + s.push_str(string); + Ok(Some(s)) } } - _ => Ok(None), - }) - .collect::<Result<GenericStringArray<T>>>()?; + } + _ => Ok(None), + }) + .collect::<Result<GenericStringArray<T>>>()?; - Ok(Arc::new(result) as ArrayRef) - } - 3 => { - let string_array = as_generic_string_array::<T>(&args[0])?; - let length_array = as_int64_array(&args[1])?; - let fill_array = as_generic_string_array::<T>(&args[2])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .zip(fill_array.iter()) - .map(|((string, length), fill)| match (string, length, fill) { - (Some(string), Some(length), Some(fill)) => { - if length > i32::MAX as i64 { - return exec_err!( - "lpad requested length {length} too large" - ); - } + Ok(Arc::new(result) as ArrayRef) + } else { + let result = string_array + .iter() + .zip(length_array.iter()) + .zip(fill_array.unwrap().iter()) + .map(|((string, length), fill)| match (string, length, fill) { + (Some(string), Some(length), Some(fill)) => { + if length > i32::MAX as i64 { + return exec_err!("lpad requested length {length} too large"); + } + + let length = if length < 0 { 0 } else { length as usize }; + if length == 0 { + Ok(Some("".to_string())) + } else { + let graphemes = string.graphemes(true).collect::<Vec<&str>>(); + let fill_chars = fill.chars().collect::<Vec<char>>(); - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) + if length < graphemes.len() { Review Comment: likewise here I think GenericStringBuilder would likely be faster -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org