This is an automated email from the ASF dual-hosted git repository.

comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new d841116fb7 Allow ColumnarValue to array conversion with less copying 
(#13644)
d841116fb7 is described below

commit d841116fb7d3f8a568f10e7f584b7d5877de4b71
Author: Piotr Findeisen <[email protected]>
AuthorDate: Wed Dec 4 19:42:40 2024 +0100

    Allow ColumnarValue to array conversion with less copying (#13644)
    
    `ColumnarValue::into_array` takes ownership of the columnar value and so
    requires copying data if the call site doesn't own the value.  This does
    not matter in many cases, since the `into_array` internally will copy
    data even more. It does matter however for the cases when to-array is
    called with desired array length of 1, which may happen when UDF
    implementation attempts to normalize arguments into arrays without
    expanding them. This pattern can be seen in regexp functions, but can be
    useful in downstream projects too.
---
 datafusion/expr-common/src/columnar_value.rs    | 20 +++++++++++++++++++-
 datafusion/functions-nested/src/array_has.rs    |  4 ++--
 datafusion/functions/src/regex/regexpcount.rs   |  2 +-
 datafusion/functions/src/regex/regexplike.rs    |  2 +-
 datafusion/functions/src/regex/regexpmatch.rs   |  2 +-
 datafusion/functions/src/regex/regexpreplace.rs |  4 ++--
 datafusion/functions/src/utils.rs               |  4 ++--
 7 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/datafusion/expr-common/src/columnar_value.rs 
b/datafusion/expr-common/src/columnar_value.rs
index 4b9454ed73..3b17e60654 100644
--- a/datafusion/expr-common/src/columnar_value.rs
+++ b/datafusion/expr-common/src/columnar_value.rs
@@ -129,6 +129,24 @@ impl ColumnarValue {
         })
     }
 
+    /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
+    /// number of rows. [`Self::Scalar`] is converted by repeating the same
+    /// scalar multiple times which is not as efficient as handling the scalar
+    /// directly.
+    ///
+    /// See [`Self::values_to_arrays`] to convert multiple columnar values into
+    /// arrays of the same length.
+    ///
+    /// # Errors
+    ///
+    /// Errors if `self` is a Scalar that fails to be converted into an array 
of size
+    pub fn to_array(&self, num_rows: usize) -> Result<ArrayRef> {
+        Ok(match self {
+            ColumnarValue::Array(array) => Arc::clone(array),
+            ColumnarValue::Scalar(scalar) => 
scalar.to_array_of_size(num_rows)?,
+        })
+    }
+
     /// Null columnar values are implemented as a null array in order to pass 
batch
     /// num_rows
     pub fn create_null_array(num_rows: usize) -> Self {
@@ -176,7 +194,7 @@ impl ColumnarValue {
 
         let args = args
             .iter()
-            .map(|arg| arg.clone().into_array(inferred_length))
+            .map(|arg| arg.to_array(inferred_length))
             .collect::<Result<Vec<_>>>()?;
 
         Ok(args)
diff --git a/datafusion/functions-nested/src/array_has.rs 
b/datafusion/functions-nested/src/array_has.rs
index c71314d826..499b07dafc 100644
--- a/datafusion/functions-nested/src/array_has.rs
+++ b/datafusion/functions-nested/src/array_has.rs
@@ -106,7 +106,7 @@ impl ScalarUDFImpl for ArrayHas {
         match &args[1] {
             ColumnarValue::Array(array_needle) => {
                 // the needle is already an array, convert the haystack to an 
array of the same length
-                let haystack = 
args[0].to_owned().into_array(array_needle.len())?;
+                let haystack = args[0].to_array(array_needle.len())?;
                 let array = array_has_inner_for_array(&haystack, 
array_needle)?;
                 Ok(ColumnarValue::Array(array))
             }
@@ -118,7 +118,7 @@ impl ScalarUDFImpl for ArrayHas {
                 }
 
                 // since the needle is a scalar, convert it to an array of 
size 1
-                let haystack = args[0].to_owned().into_array(1)?;
+                let haystack = args[0].to_array(1)?;
                 let needle = scalar_needle.to_array_of_size(1)?;
                 let needle = Scalar::new(needle);
                 let array = array_has_inner_for_scalar(&haystack, &needle)?;
diff --git a/datafusion/functions/src/regex/regexpcount.rs 
b/datafusion/functions/src/regex/regexpcount.rs
index a667d70e7b..8f06c75b2f 100644
--- a/datafusion/functions/src/regex/regexpcount.rs
+++ b/datafusion/functions/src/regex/regexpcount.rs
@@ -97,7 +97,7 @@ impl ScalarUDFImpl for RegexpCountFunc {
         let inferred_length = len.unwrap_or(1);
         let args = args
             .iter()
-            .map(|arg| arg.clone().into_array(inferred_length))
+            .map(|arg| arg.to_array(inferred_length))
             .collect::<Result<Vec<_>>>()?;
 
         let result = regexp_count_func(&args);
diff --git a/datafusion/functions/src/regex/regexplike.rs 
b/datafusion/functions/src/regex/regexplike.rs
index adbd6ef94d..49e57776c7 100644
--- a/datafusion/functions/src/regex/regexplike.rs
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -147,7 +147,7 @@ impl ScalarUDFImpl for RegexpLikeFunc {
         let inferred_length = len.unwrap_or(1);
         let args = args
             .iter()
-            .map(|arg| arg.clone().into_array(inferred_length))
+            .map(|arg| arg.to_array(inferred_length))
             .collect::<Result<Vec<_>>>()?;
 
         let result = regexp_like(&args);
diff --git a/datafusion/functions/src/regex/regexpmatch.rs 
b/datafusion/functions/src/regex/regexpmatch.rs
index 93178d23de..8362ef2f40 100644
--- a/datafusion/functions/src/regex/regexpmatch.rs
+++ b/datafusion/functions/src/regex/regexpmatch.rs
@@ -99,7 +99,7 @@ impl ScalarUDFImpl for RegexpMatchFunc {
         let inferred_length = len.unwrap_or(1);
         let args = args
             .iter()
-            .map(|arg| arg.clone().into_array(inferred_length))
+            .map(|arg| arg.to_array(inferred_length))
             .collect::<Result<Vec<_>>>()?;
 
         let result = regexp_match_func(&args);
diff --git a/datafusion/functions/src/regex/regexpreplace.rs 
b/datafusion/functions/src/regex/regexpreplace.rs
index 3f289e7c15..af02fa4934 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -575,7 +575,7 @@ pub fn specialize_regexp_replace<T: OffsetSizeTrait>(
                         Hint::AcceptsSingular => 1,
                         Hint::Pad => inferred_length,
                     };
-                    arg.clone().into_array(expansion_len)
+                    arg.to_array(expansion_len)
                 })
                 .collect::<Result<Vec<_>>>()?;
             _regexp_replace_static_pattern_replace::<T>(&args)
@@ -586,7 +586,7 @@ pub fn specialize_regexp_replace<T: OffsetSizeTrait>(
         (_, _, _, _) => {
             let args = args
                 .iter()
-                .map(|arg| arg.clone().into_array(inferred_length))
+                .map(|arg| arg.to_array(inferred_length))
                 .collect::<Result<Vec<_>>>()?;
 
             match args[0].data_type() {
diff --git a/datafusion/functions/src/utils.rs 
b/datafusion/functions/src/utils.rs
index 8b47350041..53f6074922 100644
--- a/datafusion/functions/src/utils.rs
+++ b/datafusion/functions/src/utils.rs
@@ -105,7 +105,7 @@ where
                     Hint::AcceptsSingular => 1,
                     Hint::Pad => inferred_length,
                 };
-                arg.clone().into_array(expansion_len)
+                arg.to_array(expansion_len)
             })
             .collect::<Result<Vec<_>>>()?;
 
@@ -152,7 +152,7 @@ pub mod test {
                     let result = 
func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS, 
number_rows: cardinality, return_type: &return_type});
                     assert_eq!(result.is_ok(), true, "function returned an 
error: {}", result.unwrap_err());
 
-                    let result = 
result.unwrap().clone().into_array(cardinality).expect("Failed to convert to 
array");
+                    let result = 
result.unwrap().to_array(cardinality).expect("Failed to convert to array");
                     let result = 
result.as_any().downcast_ref::<$ARRAY_TYPE>().expect("Failed to convert to 
type");
 
                     // value is correct


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to