Rachelint commented on code in PR #12395: URL: https://github.com/apache/datafusion/pull/12395#discussion_r1771904266
########## datafusion/functions/src/string/common.rs: ########## @@ -72,65 +94,126 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>( }; if use_string_view { - string_view_trim::<T>(func, args) + string_view_trim(func, args) } else { string_trim::<T>(func, args) } } // removing 'a will cause compiler complaining lifetime of `func` -fn string_view_trim<'a, T: OffsetSizeTrait>( - func: fn(&'a str, &'a str) -> &'a str, +fn string_view_trim<'a>( + trim_func: fn(&'a str, &'a str) -> &'a str, args: &'a [ArrayRef], ) -> Result<ArrayRef> { - let string_array = as_string_view_array(&args[0])?; + let string_view_array = as_string_view_array(&args[0])?; + let mut views_buf = Vec::with_capacity(string_view_array.len()); + let mut null_builder = NullBufferBuilder::new(string_view_array.len()); match args.len() { 1 => { - let result = string_array - .iter() - .map(|string| string.map(|string: &str| func(string, " "))) - .collect::<GenericStringArray<T>>(); - - Ok(Arc::new(result) as ArrayRef) + let array_iter = string_view_array.iter(); + let views_iter = string_view_array.views().iter(); + for (src_str_opt, raw_view) in array_iter.zip(views_iter) { + trim_and_append_str( + src_str_opt, + Some(" "), + trim_func, + &mut views_buf, + &mut null_builder, + raw_view, + ); + } } 2 => { let characters_array = as_string_view_array(&args[1])?; if characters_array.len() == 1 { + // Only one `trim characters` exist if characters_array.is_null(0) { return Ok(new_null_array( // The schema is expecting utf8 as null - &DataType::Utf8, - string_array.len(), + &DataType::Utf8View, + string_view_array.len(), )); } let characters = characters_array.value(0); - let result = string_array - .iter() - .map(|item| item.map(|string| func(string, characters))) - .collect::<GenericStringArray<T>>(); - return Ok(Arc::new(result) as ArrayRef); + let array_iter = string_view_array.iter(); + let views_iter = string_view_array.views().iter(); + for (src_str_opt, raw_view) in array_iter.zip(views_iter) { + trim_and_append_str( + src_str_opt, + Some(characters), + trim_func, + &mut views_buf, + &mut null_builder, + raw_view, + ); + } + } else { + // A specific `trim characters` for a row in the string view array + let characters_iter = characters_array.iter(); + let array_iter = string_view_array.iter(); + let views_iter = string_view_array.views().iter(); + for ((src_str_opt, raw_view), characters_opt) in + array_iter.zip(views_iter).zip(characters_iter) + { + trim_and_append_str( + src_str_opt, + characters_opt, + trim_func, + &mut views_buf, + &mut null_builder, + raw_view, + ); + } } - - let result = string_array - .iter() - .zip(characters_array.iter()) - .map(|(string, characters)| match (string, characters) { - (Some(string), Some(characters)) => Some(func(string, characters)), - _ => None, - }) - .collect::<GenericStringArray<T>>(); - - Ok(Arc::new(result) as ArrayRef) } other => { - exec_err!( + return exec_err!( "Function TRIM was called with {other} arguments. It requires at least 1 and at most 2." - ) + ); } } + + let views_buf = ScalarBuffer::from(views_buf); + let nulls_buf = null_builder.finish(); + + // Safety: + // (1) The blocks of the given views are all provided + // (2) Each of the range `view.offset+start..end` of view in views_buf is within + // the bounds of each of the blocks + unsafe { + let array = StringViewArray::new_unchecked( + views_buf, + string_view_array.data_buffers().to_vec(), + nulls_buf, + ); + Ok(Arc::new(array) as ArrayRef) + } +} + +fn trim_and_append_str<'a>( + src_str_opt: Option<&'a str>, + trim_characters_opt: Option<&'a str>, + trim_func: fn(&'a str, &'a str) -> &'a str, + views_buf: &mut Vec<u128>, + null_builder: &mut NullBufferBuilder, + raw: &u128, +) { + if let (Some(src_str), Some(characters)) = (src_str_opt, trim_characters_opt) { + let trim_str = trim_func(src_str, characters); + + // Safety: + // `trim_str` is computed from `str::trim_xxx_matches`, + // and its addr is ensured to be >= `origin_str`'s + let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 }; Review Comment: Sorry for my carelessness, I have added new unit test cases with the unlined string view output (len > 12). And they seems broken after removing the unsafe codes. The reason may be https://github.com/apache/datafusion/pull/12395#discussion_r1769415355 I guess maybe we can't remove the unsafe codes until the feature `Pattern` get stable https://github.com/apache/datafusion/issues/12387#issue-2513094903 I will file a issue to check it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org