This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 7e69580032  Add support for Utf8View to crypto functions #13406 
(#13407)
7e69580032 is described below

commit 7e69580032b138aadf1cf6975cd2004916885bd5
Author: Bruce Ritchie <[email protected]>
AuthorDate: Fri Nov 15 14:41:45 2024 -0500

     Add support for Utf8View to crypto functions #13406 (#13407)
---
 datafusion/functions/src/crypto/basic.rs           | 57 +++++++++++++++-----
 datafusion/functions/src/crypto/digest.rs          |  1 +
 datafusion/functions/src/crypto/md5.rs             |  4 +-
 datafusion/functions/src/crypto/sha224.rs          |  2 +-
 datafusion/functions/src/crypto/sha256.rs          |  2 +-
 datafusion/functions/src/crypto/sha384.rs          |  2 +-
 datafusion/functions/src/crypto/sha512.rs          |  2 +-
 datafusion/sqllogictest/test_files/expr.slt        |  5 ++
 .../sqllogictest/test_files/string/string_view.slt | 60 ++++++++++++++++++++++
 9 files changed, 116 insertions(+), 19 deletions(-)

diff --git a/datafusion/functions/src/crypto/basic.rs 
b/datafusion/functions/src/crypto/basic.rs
index 716afd84a9..74dc5d517c 100644
--- a/datafusion/functions/src/crypto/basic.rs
+++ b/datafusion/functions/src/crypto/basic.rs
@@ -17,17 +17,18 @@
 
 //! "crypto" DataFusion functions
 
-use arrow::array::StringArray;
 use arrow::array::{Array, ArrayRef, BinaryArray, OffsetSizeTrait};
+use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray};
 use arrow::datatypes::DataType;
 use blake2::{Blake2b512, Blake2s256, Digest};
 use blake3::Hasher as Blake3;
 use datafusion_common::cast::as_binary_array;
 
+use arrow::compute::StringArrayType;
 use datafusion_common::plan_err;
 use datafusion_common::{
-    cast::{as_generic_binary_array, as_generic_string_array},
-    exec_err, internal_err, DataFusionError, Result, ScalarValue,
+    cast::as_generic_binary_array, exec_err, internal_err, DataFusionError, 
Result,
+    ScalarValue,
 };
 use datafusion_expr::ColumnarValue;
 use md5::Md5;
@@ -121,9 +122,9 @@ pub fn digest(args: &[ColumnarValue]) -> 
Result<ColumnarValue> {
     }
     let digest_algorithm = match &args[1] {
         ColumnarValue::Scalar(scalar) => match scalar {
-            ScalarValue::Utf8(Some(method)) | 
ScalarValue::LargeUtf8(Some(method)) => {
-                method.parse::<DigestAlgorithm>()
-            }
+            ScalarValue::Utf8View(Some(method))
+            | ScalarValue::Utf8(Some(method))
+            | ScalarValue::LargeUtf8(Some(method)) => 
method.parse::<DigestAlgorithm>(),
             other => exec_err!("Unsupported data type {other:?} for function 
digest"),
         },
         ColumnarValue::Array(_) => {
@@ -132,6 +133,7 @@ pub fn digest(args: &[ColumnarValue]) -> 
Result<ColumnarValue> {
     }?;
     digest_process(&args[0], digest_algorithm)
 }
+
 impl FromStr for DigestAlgorithm {
     type Err = DataFusionError;
     fn from_str(name: &str) -> Result<DigestAlgorithm> {
@@ -166,12 +168,14 @@ impl FromStr for DigestAlgorithm {
         })
     }
 }
+
 impl fmt::Display for DigestAlgorithm {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "{}", format!("{self:?}").to_lowercase())
     }
 }
-// /// computes md5 hash digest of the given input
+
+/// computes md5 hash digest of the given input
 pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
     if args.len() != 1 {
         return exec_err!(
@@ -180,7 +184,9 @@ pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> 
{
             DigestAlgorithm::Md5
         );
     }
+
     let value = digest_process(&args[0], DigestAlgorithm::Md5)?;
+
     // md5 requires special handling because of its unique utf8 return type
     Ok(match value {
         ColumnarValue::Array(array) => {
@@ -214,7 +220,8 @@ pub fn utf8_or_binary_to_binary_type(
     name: &str,
 ) -> Result<DataType> {
     Ok(match arg_type {
-        DataType::LargeUtf8
+        DataType::Utf8View
+        | DataType::LargeUtf8
         | DataType::Utf8
         | DataType::Binary
         | DataType::LargeBinary => DataType::Binary,
@@ -296,8 +303,30 @@ impl DigestAlgorithm {
     where
         T: OffsetSizeTrait,
     {
-        let input_value = as_generic_string_array::<T>(value)?;
-        let array: ArrayRef = match self {
+        let array = match value.data_type() {
+            DataType::Utf8 | DataType::LargeUtf8 => {
+                let v = value.as_string::<T>();
+                self.digest_utf8_array_impl::<&GenericStringArray<T>>(v)
+            }
+            DataType::Utf8View => {
+                let v = value.as_string_view();
+                self.digest_utf8_array_impl::<&StringViewArray>(v)
+            }
+            other => {
+                return exec_err!("unsupported type for digest_utf_array: 
{other:?}")
+            }
+        };
+        Ok(ColumnarValue::Array(array))
+    }
+
+    pub fn digest_utf8_array_impl<'a, StringArrType>(
+        self,
+        input_value: StringArrType,
+    ) -> ArrayRef
+    where
+        StringArrType: StringArrayType<'a>,
+    {
+        match self {
             Self::Md5 => digest_to_array!(Md5, input_value),
             Self::Sha224 => digest_to_array!(Sha224, input_value),
             Self::Sha256 => digest_to_array!(Sha256, input_value),
@@ -318,8 +347,7 @@ impl DigestAlgorithm {
                     .collect();
                 Arc::new(binary_array)
             }
-        };
-        Ok(ColumnarValue::Array(array))
+        }
     }
 }
 pub fn digest_process(
@@ -328,6 +356,7 @@ pub fn digest_process(
 ) -> Result<ColumnarValue> {
     match value {
         ColumnarValue::Array(a) => match a.data_type() {
+            DataType::Utf8View => 
digest_algorithm.digest_utf8_array::<i32>(a.as_ref()),
             DataType::Utf8 => 
digest_algorithm.digest_utf8_array::<i32>(a.as_ref()),
             DataType::LargeUtf8 => 
digest_algorithm.digest_utf8_array::<i64>(a.as_ref()),
             DataType::Binary => 
digest_algorithm.digest_binary_array::<i32>(a.as_ref()),
@@ -339,7 +368,9 @@ pub fn digest_process(
             ),
         },
         ColumnarValue::Scalar(scalar) => match scalar {
-            ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => {
+            ScalarValue::Utf8View(a)
+            | ScalarValue::Utf8(a)
+            | ScalarValue::LargeUtf8(a) => {
                 Ok(digest_algorithm
                     .digest_scalar(a.as_ref().map(|s: &String| s.as_bytes())))
             }
diff --git a/datafusion/functions/src/crypto/digest.rs 
b/datafusion/functions/src/crypto/digest.rs
index 0e43fb7785..f738c6e3e4 100644
--- a/datafusion/functions/src/crypto/digest.rs
+++ b/datafusion/functions/src/crypto/digest.rs
@@ -42,6 +42,7 @@ impl DigestFunc {
         Self {
             signature: Signature::one_of(
                 vec![
+                    Exact(vec![Utf8View, Utf8View]),
                     Exact(vec![Utf8, Utf8]),
                     Exact(vec![LargeUtf8, Utf8]),
                     Exact(vec![Binary, Utf8]),
diff --git a/datafusion/functions/src/crypto/md5.rs 
b/datafusion/functions/src/crypto/md5.rs
index 062d63bcc0..0f18fd47b4 100644
--- a/datafusion/functions/src/crypto/md5.rs
+++ b/datafusion/functions/src/crypto/md5.rs
@@ -42,7 +42,7 @@ impl Md5Func {
         Self {
             signature: Signature::uniform(
                 1,
-                vec![Utf8, LargeUtf8, Binary, LargeBinary],
+                vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
                 Volatility::Immutable,
             ),
         }
@@ -65,7 +65,7 @@ impl ScalarUDFImpl for Md5Func {
         use DataType::*;
         Ok(match &arg_types[0] {
             LargeUtf8 | LargeBinary => LargeUtf8,
-            Utf8 | Binary => Utf8,
+            Utf8View | Utf8 | Binary => Utf8,
             Null => Null,
             Dictionary(_, t) => match **t {
                 LargeUtf8 | LargeBinary => LargeUtf8,
diff --git a/datafusion/functions/src/crypto/sha224.rs 
b/datafusion/functions/src/crypto/sha224.rs
index 39202d5bf6..f0bfcb9fab 100644
--- a/datafusion/functions/src/crypto/sha224.rs
+++ b/datafusion/functions/src/crypto/sha224.rs
@@ -43,7 +43,7 @@ impl SHA224Func {
         Self {
             signature: Signature::uniform(
                 1,
-                vec![Utf8, LargeUtf8, Binary, LargeBinary],
+                vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
                 Volatility::Immutable,
             ),
         }
diff --git a/datafusion/functions/src/crypto/sha256.rs 
b/datafusion/functions/src/crypto/sha256.rs
index 74deb3fc6c..0a0044f722 100644
--- a/datafusion/functions/src/crypto/sha256.rs
+++ b/datafusion/functions/src/crypto/sha256.rs
@@ -42,7 +42,7 @@ impl SHA256Func {
         Self {
             signature: Signature::uniform(
                 1,
-                vec![Utf8, LargeUtf8, Binary, LargeBinary],
+                vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
                 Volatility::Immutable,
             ),
         }
diff --git a/datafusion/functions/src/crypto/sha384.rs 
b/datafusion/functions/src/crypto/sha384.rs
index 9b1e1ba9ec..7f8220e5f9 100644
--- a/datafusion/functions/src/crypto/sha384.rs
+++ b/datafusion/functions/src/crypto/sha384.rs
@@ -42,7 +42,7 @@ impl SHA384Func {
         Self {
             signature: Signature::uniform(
                 1,
-                vec![Utf8, LargeUtf8, Binary, LargeBinary],
+                vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
                 Volatility::Immutable,
             ),
         }
diff --git a/datafusion/functions/src/crypto/sha512.rs 
b/datafusion/functions/src/crypto/sha512.rs
index c88579fd08..d2d51bfa53 100644
--- a/datafusion/functions/src/crypto/sha512.rs
+++ b/datafusion/functions/src/crypto/sha512.rs
@@ -42,7 +42,7 @@ impl SHA512Func {
         Self {
             signature: Signature::uniform(
                 1,
-                vec![Utf8, LargeUtf8, Binary, LargeBinary],
+                vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
                 Volatility::Immutable,
             ),
         }
diff --git a/datafusion/sqllogictest/test_files/expr.slt 
b/datafusion/sqllogictest/test_files/expr.slt
index c653113fd4..15bf771c65 100644
--- a/datafusion/sqllogictest/test_files/expr.slt
+++ b/datafusion/sqllogictest/test_files/expr.slt
@@ -2225,6 +2225,11 @@ SELECT digest('','blake3');
 ----
 af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262
 
+# verify utf8view
+query ?
+SELECT sha224(arrow_cast('tom', 'Utf8View'));
+----
+0bf6cb62649c42a9ae3876ab6f6d92ad36cb5414e495f8873292be4d
 
 query T
 SELECT substring('alphabet', 1)
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt 
b/datafusion/sqllogictest/test_files/string/string_view.slt
index 12295a01a9..5a08f3f544 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -963,6 +963,66 @@ logical_plan
 01)Projection: nullif(test.column1_utf8view, test.column1_utf8view) AS c
 02)--TableScan: test projection=[column1_utf8view]
 
+## Ensure no casts for md5
+query TT
+EXPLAIN SELECT
+  md5(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: md5(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for sha224
+query TT
+EXPLAIN SELECT
+  sha224(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: sha224(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for sha256
+query TT
+EXPLAIN SELECT
+  sha256(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: sha256(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for sha384
+query TT
+EXPLAIN SELECT
+  sha384(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: sha384(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for sha512
+query TT
+EXPLAIN SELECT
+  sha512(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: sha512(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for digest
+query TT
+EXPLAIN SELECT
+  digest(column1_utf8view, 'md5') as c
+FROM test;
+----
+logical_plan
+01)Projection: digest(test.column1_utf8view, Utf8View("md5")) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
 ## Ensure no casts for binary operators
 # `~` operator (regex match)
 query TT


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to