Omega359 commented on code in PR #11941: URL: https://github.com/apache/datafusion/pull/11941#discussion_r1714624067
########## datafusion/functions/src/unicode/lpad.rs: ########## @@ -76,300 +87,450 @@ impl ScalarUDFImpl for LPadFunc { } fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { - match args[0].data_type() { - DataType::Utf8 => make_scalar_function(lpad::<i32>, vec![])(args), - DataType::LargeUtf8 => make_scalar_function(lpad::<i64>, vec![])(args), - other => exec_err!("Unsupported data type {other:?} for function lpad"), - } + make_scalar_function(lpad, vec![])(args) } } -/// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). +/// Extends the string to length 'length' by prepending the characters fill (a space by default). +/// If the string is already longer than length then it is truncated (on the right). /// lpad('hi', 5, 'xy') = 'xyxhi' -pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { - match args.len() { - 2 => { - let string_array = as_generic_string_array::<T>(&args[0])?; - let length_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(length_array.iter()) - .map(|(string, length)| match (string, length) { - (Some(string), Some(length)) => { - if length > i32::MAX as i64 { - return exec_err!( - "lpad requested length {length} too large" - ); - } +pub fn lpad(args: &[ArrayRef]) -> Result<ArrayRef> { + if args.len() <= 1 || args.len() > 3 { + return exec_err!( + "lpad was called with {} arguments. It requires at least 2 and at most 3.", + args.len() + ); + } + + let length_array = as_int64_array(&args[1])?; + + match args[0].data_type() { + Utf8 => match args.len() { + 2 => lpad_impl::<&GenericStringArray<i32>, &GenericStringArray<i32>, i32>( + args[0].as_string::<i32>(), + length_array, + None, + ), + 3 => lpad_with_replace::<&GenericStringArray<i32>, i32>( + args[0].as_string::<i32>(), + length_array, + &args[2], + ), + _ => unreachable!(), + }, + LargeUtf8 => match args.len() { + 2 => lpad_impl::<&GenericStringArray<i64>, &GenericStringArray<i64>, i64>( + args[0].as_string::<i64>(), + length_array, + None, + ), + 3 => lpad_with_replace::<&GenericStringArray<i64>, i64>( + args[0].as_string::<i64>(), + length_array, + &args[2], + ), + _ => unreachable!(), + }, + Utf8View => match args.len() { + 2 => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, i32>( + args[0].as_string_view(), + length_array, + None, + ), + 3 => lpad_with_replace::<&StringViewArray, i32>( + args[0].as_string_view(), + length_array, + &args[2], + ), + _ => unreachable!(), + }, + other => { + exec_err!("Unsupported data type {other:?} for function lpad") + } + } +} - let length = if length < 0 { 0 } else { length as usize }; - if length == 0 { - Ok(Some("".to_string())) +fn lpad_with_replace<'a, V, T: OffsetSizeTrait>( + string_array: V, + length_array: &Int64Array, + fill_array: &'a ArrayRef, +) -> Result<ArrayRef> +where + V: StringArrayType<'a>, +{ + match fill_array.data_type() { + Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>( + string_array, + length_array, + Some(fill_array.as_string::<i32>()), + ), + LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>( + string_array, + length_array, + Some(fill_array.as_string::<i64>()), + ), + Utf8View => lpad_impl::<V, &StringViewArray, T>( + string_array, + length_array, + Some(fill_array.as_string_view()), + ), + other => { + exec_err!("Unsupported data type {other:?} for function lpad") + } + } +} + +fn lpad_impl<'a, V, V2, T>( + string_array: V, + length_array: &Int64Array, + fill_array: Option<V2>, +) -> Result<ArrayRef> +where + V: StringArrayType<'a>, + V2: StringArrayType<'a>, + T: OffsetSizeTrait, +{ + if fill_array.is_none() { + let result = string_array + .iter() + .zip(length_array.iter()) + .map(|(string, length)| match (string, length) { + (Some(string), Some(length)) => { + if length > i32::MAX as i64 { + return exec_err!("lpad requested length {length} too large"); + } + + let length = if length < 0 { 0 } else { length as usize }; + if length == 0 { + Ok(Some("".to_string())) + } else { + let graphemes = string.graphemes(true).collect::<Vec<&str>>(); + if length < graphemes.len() { + Ok(Some(graphemes[..length].concat())) } else { - let graphemes = string.graphemes(true).collect::<Vec<&str>>(); - if length < graphemes.len() { - Ok(Some(graphemes[..length].concat())) - } else { - let mut s: String = " ".repeat(length - graphemes.len()); - s.push_str(string); - Ok(Some(s)) - } + let mut s: String = " ".repeat(length - graphemes.len()); Review Comment: I did a test locally with using `GenericStringBuilder` vs what is in this PR and the differences are within +/- 10% for 1024 and 2048 element arrays for each of the 3 string types. Worth noting though is that timing between runs on my laptop is not really stable at all (+/- 10% or so between runs of the exact same code). Interestingly the utf8view seems to be consistently slightly slower than just plain utf8 at least in this function. ``` lpad function/utf8 type/2048 time: [715.78 µs 718.43 µs 721.89 µs] lpad function/largeutf8 type/2048 time: [745.44 µs 759.62 µs 779.84 µs] lpad function/stringview type/2048 time: [734.25 µs 751.13 µs 773.67 µs] ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org