This is an automated email from the ASF dual-hosted git repository.
comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 437cbf847a Optimize performance of `character_length` function (#13696)
437cbf847a is described below
commit 437cbf847a80beae791af821d73c56d8226473be
Author: Tai Le Manh <[email protected]>
AuthorDate: Tue Dec 10 09:33:25 2024 +0700
Optimize performance of `character_length` function (#13696)
* Optimize performance of function
Signed-off-by: Tai Le Manh <[email protected]>
* Add pre-check array is null
* Fix clippy warnings
---------
Signed-off-by: Tai Le Manh <[email protected]>
---
.../functions/src/unicode/character_length.rs | 57 +++++++++++++++-------
1 file changed, 39 insertions(+), 18 deletions(-)
diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs
index 822bdca9ac..ad51a8ef72 100644
--- a/datafusion/functions/src/unicode/character_length.rs
+++ b/datafusion/functions/src/unicode/character_length.rs
@@ -18,7 +18,7 @@
use crate::strings::StringArrayType;
use crate::utils::{make_scalar_function, utf8_to_int_type};
use arrow::array::{
- Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray,
+ Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveBuilder,
};
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
use datafusion_common::Result;
@@ -136,31 +136,52 @@ fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> {
}
}
-fn character_length_general<'a, T: ArrowPrimitiveType, V: StringArrayType<'a>>(
- array: V,
-) -> Result<ArrayRef>
+fn character_length_general<'a, T, V>(array: V) -> Result<ArrayRef>
where
+ T: ArrowPrimitiveType,
T::Native: OffsetSizeTrait,
+ V: StringArrayType<'a>,
{
+ let mut builder = PrimitiveBuilder::<T>::with_capacity(array.len());
+
// String characters are variable length encoded in UTF-8, counting the
// number of chars requires expensive decoding, however checking if the
// string is ASCII only is relatively cheap.
// If strings are ASCII only, count bytes instead.
let is_array_ascii_only = array.is_ascii();
- let iter = array.iter();
- let result = iter
- .map(|string| {
- string.map(|string: &str| {
- if is_array_ascii_only {
- T::Native::usize_as(string.len())
- } else {
- T::Native::usize_as(string.chars().count())
- }
- })
- })
- .collect::<PrimitiveArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
+ if array.null_count() == 0 {
+ if is_array_ascii_only {
+ for i in 0..array.len() {
+ let value = array.value(i);
+ builder.append_value(T::Native::usize_as(value.len()));
+ }
+ } else {
+ for i in 0..array.len() {
+ let value = array.value(i);
+ builder.append_value(T::Native::usize_as(value.chars().count()));
+ }
+ }
+ } else if is_array_ascii_only {
+ for i in 0..array.len() {
+ if array.is_null(i) {
+ builder.append_null();
+ } else {
+ let value = array.value(i);
+ builder.append_value(T::Native::usize_as(value.len()));
+ }
+ }
+ } else {
+ for i in 0..array.len() {
+ if array.is_null(i) {
+ builder.append_null();
+ } else {
+ let value = array.value(i);
+ builder.append_value(T::Native::usize_as(value.chars().count()));
+ }
+ }
+ }
+
+ Ok(Arc::new(builder.finish()) as ArrayRef)
}
#[cfg(test)]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]