This is an automated email from the ASF dual-hosted git repository.
comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new d841116fb7 Allow ColumnarValue to array conversion with less copying
(#13644)
d841116fb7 is described below
commit d841116fb7d3f8a568f10e7f584b7d5877de4b71
Author: Piotr Findeisen <[email protected]>
AuthorDate: Wed Dec 4 19:42:40 2024 +0100
Allow ColumnarValue to array conversion with less copying (#13644)
`ColumnarValue::into_array` takes ownership of the columnar value and so
requires copying data if the call site doesn't own the value. In many
cases this does not matter, since `into_array` itself copies even more
data internally. It does matter, however, when the conversion is called
with a desired array length of 1, which may happen when a UDF
implementation attempts to normalize arguments into arrays without
expanding them. This pattern can be seen in the regexp functions, but can
also be useful in downstream projects.
---
datafusion/expr-common/src/columnar_value.rs | 20 +++++++++++++++++++-
datafusion/functions-nested/src/array_has.rs | 4 ++--
datafusion/functions/src/regex/regexpcount.rs | 2 +-
datafusion/functions/src/regex/regexplike.rs | 2 +-
datafusion/functions/src/regex/regexpmatch.rs | 2 +-
datafusion/functions/src/regex/regexpreplace.rs | 4 ++--
datafusion/functions/src/utils.rs | 4 ++--
7 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/datafusion/expr-common/src/columnar_value.rs
b/datafusion/expr-common/src/columnar_value.rs
index 4b9454ed73..3b17e60654 100644
--- a/datafusion/expr-common/src/columnar_value.rs
+++ b/datafusion/expr-common/src/columnar_value.rs
@@ -129,6 +129,24 @@ impl ColumnarValue {
})
}
+ /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
+ /// number of rows. [`Self::Scalar`] is converted by repeating the same
+ /// scalar multiple times which is not as efficient as handling the scalar
+ /// directly.
+ ///
+ /// See [`Self::values_to_arrays`] to convert multiple columnar values into
+ /// arrays of the same length.
+ ///
+ /// # Errors
+ ///
+ /// Errors if `self` is a Scalar that fails to be converted into an array
of size `num_rows`
+ pub fn to_array(&self, num_rows: usize) -> Result<ArrayRef> {
+ Ok(match self {
+ ColumnarValue::Array(array) => Arc::clone(array),
+ ColumnarValue::Scalar(scalar) =>
scalar.to_array_of_size(num_rows)?,
+ })
+ }
+
/// Null columnar values are implemented as a null array in order to pass
batch
/// num_rows
pub fn create_null_array(num_rows: usize) -> Self {
@@ -176,7 +194,7 @@ impl ColumnarValue {
let args = args
.iter()
- .map(|arg| arg.clone().into_array(inferred_length))
+ .map(|arg| arg.to_array(inferred_length))
.collect::<Result<Vec<_>>>()?;
Ok(args)
diff --git a/datafusion/functions-nested/src/array_has.rs
b/datafusion/functions-nested/src/array_has.rs
index c71314d826..499b07dafc 100644
--- a/datafusion/functions-nested/src/array_has.rs
+++ b/datafusion/functions-nested/src/array_has.rs
@@ -106,7 +106,7 @@ impl ScalarUDFImpl for ArrayHas {
match &args[1] {
ColumnarValue::Array(array_needle) => {
// the needle is already an array, convert the haystack to an
array of the same length
- let haystack =
args[0].to_owned().into_array(array_needle.len())?;
+ let haystack = args[0].to_array(array_needle.len())?;
let array = array_has_inner_for_array(&haystack,
array_needle)?;
Ok(ColumnarValue::Array(array))
}
@@ -118,7 +118,7 @@ impl ScalarUDFImpl for ArrayHas {
}
// since the needle is a scalar, convert it to an array of
size 1
- let haystack = args[0].to_owned().into_array(1)?;
+ let haystack = args[0].to_array(1)?;
let needle = scalar_needle.to_array_of_size(1)?;
let needle = Scalar::new(needle);
let array = array_has_inner_for_scalar(&haystack, &needle)?;
diff --git a/datafusion/functions/src/regex/regexpcount.rs
b/datafusion/functions/src/regex/regexpcount.rs
index a667d70e7b..8f06c75b2f 100644
--- a/datafusion/functions/src/regex/regexpcount.rs
+++ b/datafusion/functions/src/regex/regexpcount.rs
@@ -97,7 +97,7 @@ impl ScalarUDFImpl for RegexpCountFunc {
let inferred_length = len.unwrap_or(1);
let args = args
.iter()
- .map(|arg| arg.clone().into_array(inferred_length))
+ .map(|arg| arg.to_array(inferred_length))
.collect::<Result<Vec<_>>>()?;
let result = regexp_count_func(&args);
diff --git a/datafusion/functions/src/regex/regexplike.rs
b/datafusion/functions/src/regex/regexplike.rs
index adbd6ef94d..49e57776c7 100644
--- a/datafusion/functions/src/regex/regexplike.rs
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -147,7 +147,7 @@ impl ScalarUDFImpl for RegexpLikeFunc {
let inferred_length = len.unwrap_or(1);
let args = args
.iter()
- .map(|arg| arg.clone().into_array(inferred_length))
+ .map(|arg| arg.to_array(inferred_length))
.collect::<Result<Vec<_>>>()?;
let result = regexp_like(&args);
diff --git a/datafusion/functions/src/regex/regexpmatch.rs
b/datafusion/functions/src/regex/regexpmatch.rs
index 93178d23de..8362ef2f40 100644
--- a/datafusion/functions/src/regex/regexpmatch.rs
+++ b/datafusion/functions/src/regex/regexpmatch.rs
@@ -99,7 +99,7 @@ impl ScalarUDFImpl for RegexpMatchFunc {
let inferred_length = len.unwrap_or(1);
let args = args
.iter()
- .map(|arg| arg.clone().into_array(inferred_length))
+ .map(|arg| arg.to_array(inferred_length))
.collect::<Result<Vec<_>>>()?;
let result = regexp_match_func(&args);
diff --git a/datafusion/functions/src/regex/regexpreplace.rs
b/datafusion/functions/src/regex/regexpreplace.rs
index 3f289e7c15..af02fa4934 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -575,7 +575,7 @@ pub fn specialize_regexp_replace<T: OffsetSizeTrait>(
Hint::AcceptsSingular => 1,
Hint::Pad => inferred_length,
};
- arg.clone().into_array(expansion_len)
+ arg.to_array(expansion_len)
})
.collect::<Result<Vec<_>>>()?;
_regexp_replace_static_pattern_replace::<T>(&args)
@@ -586,7 +586,7 @@ pub fn specialize_regexp_replace<T: OffsetSizeTrait>(
(_, _, _, _) => {
let args = args
.iter()
- .map(|arg| arg.clone().into_array(inferred_length))
+ .map(|arg| arg.to_array(inferred_length))
.collect::<Result<Vec<_>>>()?;
match args[0].data_type() {
diff --git a/datafusion/functions/src/utils.rs
b/datafusion/functions/src/utils.rs
index 8b47350041..53f6074922 100644
--- a/datafusion/functions/src/utils.rs
+++ b/datafusion/functions/src/utils.rs
@@ -105,7 +105,7 @@ where
Hint::AcceptsSingular => 1,
Hint::Pad => inferred_length,
};
- arg.clone().into_array(expansion_len)
+ arg.to_array(expansion_len)
})
.collect::<Result<Vec<_>>>()?;
@@ -152,7 +152,7 @@ pub mod test {
let result =
func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS,
number_rows: cardinality, return_type: &return_type});
assert_eq!(result.is_ok(), true, "function returned an
error: {}", result.unwrap_err());
- let result =
result.unwrap().clone().into_array(cardinality).expect("Failed to convert to
array");
+ let result =
result.unwrap().to_array(cardinality).expect("Failed to convert to array");
let result =
result.as_any().downcast_ref::<$ARRAY_TYPE>().expect("Failed to convert to
type");
// value is correct
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]