This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 7e69580032 Add support for Utf8View to crypto functions #13406
(#13407)
7e69580032 is described below
commit 7e69580032b138aadf1cf6975cd2004916885bd5
Author: Bruce Ritchie <[email protected]>
AuthorDate: Fri Nov 15 14:41:45 2024 -0500
Add support for Utf8View to crypto functions #13406 (#13407)
---
datafusion/functions/src/crypto/basic.rs | 57 +++++++++++++++-----
datafusion/functions/src/crypto/digest.rs | 1 +
datafusion/functions/src/crypto/md5.rs | 4 +-
datafusion/functions/src/crypto/sha224.rs | 2 +-
datafusion/functions/src/crypto/sha256.rs | 2 +-
datafusion/functions/src/crypto/sha384.rs | 2 +-
datafusion/functions/src/crypto/sha512.rs | 2 +-
datafusion/sqllogictest/test_files/expr.slt | 5 ++
.../sqllogictest/test_files/string/string_view.slt | 60 ++++++++++++++++++++++
9 files changed, 116 insertions(+), 19 deletions(-)
diff --git a/datafusion/functions/src/crypto/basic.rs
b/datafusion/functions/src/crypto/basic.rs
index 716afd84a9..74dc5d517c 100644
--- a/datafusion/functions/src/crypto/basic.rs
+++ b/datafusion/functions/src/crypto/basic.rs
@@ -17,17 +17,18 @@
//! "crypto" DataFusion functions
-use arrow::array::StringArray;
use arrow::array::{Array, ArrayRef, BinaryArray, OffsetSizeTrait};
+use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray};
use arrow::datatypes::DataType;
use blake2::{Blake2b512, Blake2s256, Digest};
use blake3::Hasher as Blake3;
use datafusion_common::cast::as_binary_array;
+use arrow::compute::StringArrayType;
use datafusion_common::plan_err;
use datafusion_common::{
- cast::{as_generic_binary_array, as_generic_string_array},
- exec_err, internal_err, DataFusionError, Result, ScalarValue,
+ cast::as_generic_binary_array, exec_err, internal_err, DataFusionError,
Result,
+ ScalarValue,
};
use datafusion_expr::ColumnarValue;
use md5::Md5;
@@ -121,9 +122,9 @@ pub fn digest(args: &[ColumnarValue]) ->
Result<ColumnarValue> {
}
let digest_algorithm = match &args[1] {
ColumnarValue::Scalar(scalar) => match scalar {
- ScalarValue::Utf8(Some(method)) |
ScalarValue::LargeUtf8(Some(method)) => {
- method.parse::<DigestAlgorithm>()
- }
+ ScalarValue::Utf8View(Some(method))
+ | ScalarValue::Utf8(Some(method))
+ | ScalarValue::LargeUtf8(Some(method)) =>
method.parse::<DigestAlgorithm>(),
other => exec_err!("Unsupported data type {other:?} for function
digest"),
},
ColumnarValue::Array(_) => {
@@ -132,6 +133,7 @@ pub fn digest(args: &[ColumnarValue]) ->
Result<ColumnarValue> {
}?;
digest_process(&args[0], digest_algorithm)
}
+
impl FromStr for DigestAlgorithm {
type Err = DataFusionError;
fn from_str(name: &str) -> Result<DigestAlgorithm> {
@@ -166,12 +168,14 @@ impl FromStr for DigestAlgorithm {
})
}
}
+
impl fmt::Display for DigestAlgorithm {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", format!("{self:?}").to_lowercase())
}
}
-// /// computes md5 hash digest of the given input
+
+/// computes md5 hash digest of the given input
pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
if args.len() != 1 {
return exec_err!(
@@ -180,7 +184,9 @@ pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue>
{
DigestAlgorithm::Md5
);
}
+
let value = digest_process(&args[0], DigestAlgorithm::Md5)?;
+
// md5 requires special handling because of its unique utf8 return type
Ok(match value {
ColumnarValue::Array(array) => {
@@ -214,7 +220,8 @@ pub fn utf8_or_binary_to_binary_type(
name: &str,
) -> Result<DataType> {
Ok(match arg_type {
- DataType::LargeUtf8
+ DataType::Utf8View
+ | DataType::LargeUtf8
| DataType::Utf8
| DataType::Binary
| DataType::LargeBinary => DataType::Binary,
@@ -296,8 +303,30 @@ impl DigestAlgorithm {
where
T: OffsetSizeTrait,
{
- let input_value = as_generic_string_array::<T>(value)?;
- let array: ArrayRef = match self {
+ let array = match value.data_type() {
+ DataType::Utf8 | DataType::LargeUtf8 => {
+ let v = value.as_string::<T>();
+ self.digest_utf8_array_impl::<&GenericStringArray<T>>(v)
+ }
+ DataType::Utf8View => {
+ let v = value.as_string_view();
+ self.digest_utf8_array_impl::<&StringViewArray>(v)
+ }
+ other => {
+ return exec_err!("unsupported type for digest_utf_array:
{other:?}")
+ }
+ };
+ Ok(ColumnarValue::Array(array))
+ }
+
+ pub fn digest_utf8_array_impl<'a, StringArrType>(
+ self,
+ input_value: StringArrType,
+ ) -> ArrayRef
+ where
+ StringArrType: StringArrayType<'a>,
+ {
+ match self {
Self::Md5 => digest_to_array!(Md5, input_value),
Self::Sha224 => digest_to_array!(Sha224, input_value),
Self::Sha256 => digest_to_array!(Sha256, input_value),
@@ -318,8 +347,7 @@ impl DigestAlgorithm {
.collect();
Arc::new(binary_array)
}
- };
- Ok(ColumnarValue::Array(array))
+ }
}
}
pub fn digest_process(
@@ -328,6 +356,7 @@ pub fn digest_process(
) -> Result<ColumnarValue> {
match value {
ColumnarValue::Array(a) => match a.data_type() {
+ DataType::Utf8View =>
digest_algorithm.digest_utf8_array::<i32>(a.as_ref()),
DataType::Utf8 =>
digest_algorithm.digest_utf8_array::<i32>(a.as_ref()),
DataType::LargeUtf8 =>
digest_algorithm.digest_utf8_array::<i64>(a.as_ref()),
DataType::Binary =>
digest_algorithm.digest_binary_array::<i32>(a.as_ref()),
@@ -339,7 +368,9 @@ pub fn digest_process(
),
},
ColumnarValue::Scalar(scalar) => match scalar {
- ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => {
+ ScalarValue::Utf8View(a)
+ | ScalarValue::Utf8(a)
+ | ScalarValue::LargeUtf8(a) => {
Ok(digest_algorithm
.digest_scalar(a.as_ref().map(|s: &String| s.as_bytes())))
}
diff --git a/datafusion/functions/src/crypto/digest.rs
b/datafusion/functions/src/crypto/digest.rs
index 0e43fb7785..f738c6e3e4 100644
--- a/datafusion/functions/src/crypto/digest.rs
+++ b/datafusion/functions/src/crypto/digest.rs
@@ -42,6 +42,7 @@ impl DigestFunc {
Self {
signature: Signature::one_of(
vec![
+ Exact(vec![Utf8View, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![LargeUtf8, Utf8]),
Exact(vec![Binary, Utf8]),
diff --git a/datafusion/functions/src/crypto/md5.rs
b/datafusion/functions/src/crypto/md5.rs
index 062d63bcc0..0f18fd47b4 100644
--- a/datafusion/functions/src/crypto/md5.rs
+++ b/datafusion/functions/src/crypto/md5.rs
@@ -42,7 +42,7 @@ impl Md5Func {
Self {
signature: Signature::uniform(
1,
- vec![Utf8, LargeUtf8, Binary, LargeBinary],
+ vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
Volatility::Immutable,
),
}
@@ -65,7 +65,7 @@ impl ScalarUDFImpl for Md5Func {
use DataType::*;
Ok(match &arg_types[0] {
LargeUtf8 | LargeBinary => LargeUtf8,
- Utf8 | Binary => Utf8,
+ Utf8View | Utf8 | Binary => Utf8,
Null => Null,
Dictionary(_, t) => match **t {
LargeUtf8 | LargeBinary => LargeUtf8,
diff --git a/datafusion/functions/src/crypto/sha224.rs
b/datafusion/functions/src/crypto/sha224.rs
index 39202d5bf6..f0bfcb9fab 100644
--- a/datafusion/functions/src/crypto/sha224.rs
+++ b/datafusion/functions/src/crypto/sha224.rs
@@ -43,7 +43,7 @@ impl SHA224Func {
Self {
signature: Signature::uniform(
1,
- vec![Utf8, LargeUtf8, Binary, LargeBinary],
+ vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
Volatility::Immutable,
),
}
diff --git a/datafusion/functions/src/crypto/sha256.rs
b/datafusion/functions/src/crypto/sha256.rs
index 74deb3fc6c..0a0044f722 100644
--- a/datafusion/functions/src/crypto/sha256.rs
+++ b/datafusion/functions/src/crypto/sha256.rs
@@ -42,7 +42,7 @@ impl SHA256Func {
Self {
signature: Signature::uniform(
1,
- vec![Utf8, LargeUtf8, Binary, LargeBinary],
+ vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
Volatility::Immutable,
),
}
diff --git a/datafusion/functions/src/crypto/sha384.rs
b/datafusion/functions/src/crypto/sha384.rs
index 9b1e1ba9ec..7f8220e5f9 100644
--- a/datafusion/functions/src/crypto/sha384.rs
+++ b/datafusion/functions/src/crypto/sha384.rs
@@ -42,7 +42,7 @@ impl SHA384Func {
Self {
signature: Signature::uniform(
1,
- vec![Utf8, LargeUtf8, Binary, LargeBinary],
+ vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
Volatility::Immutable,
),
}
diff --git a/datafusion/functions/src/crypto/sha512.rs
b/datafusion/functions/src/crypto/sha512.rs
index c88579fd08..d2d51bfa53 100644
--- a/datafusion/functions/src/crypto/sha512.rs
+++ b/datafusion/functions/src/crypto/sha512.rs
@@ -42,7 +42,7 @@ impl SHA512Func {
Self {
signature: Signature::uniform(
1,
- vec![Utf8, LargeUtf8, Binary, LargeBinary],
+ vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary],
Volatility::Immutable,
),
}
diff --git a/datafusion/sqllogictest/test_files/expr.slt
b/datafusion/sqllogictest/test_files/expr.slt
index c653113fd4..15bf771c65 100644
--- a/datafusion/sqllogictest/test_files/expr.slt
+++ b/datafusion/sqllogictest/test_files/expr.slt
@@ -2225,6 +2225,11 @@ SELECT digest('','blake3');
----
af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262
+# verify utf8view
+query ?
+SELECT sha224(arrow_cast('tom', 'Utf8View'));
+----
+0bf6cb62649c42a9ae3876ab6f6d92ad36cb5414e495f8873292be4d
query T
SELECT substring('alphabet', 1)
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt
b/datafusion/sqllogictest/test_files/string/string_view.slt
index 12295a01a9..5a08f3f544 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -963,6 +963,66 @@ logical_plan
01)Projection: nullif(test.column1_utf8view, test.column1_utf8view) AS c
02)--TableScan: test projection=[column1_utf8view]
+## Ensure no casts for md5
+query TT
+EXPLAIN SELECT
+ md5(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: md5(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for sha224
+query TT
+EXPLAIN SELECT
+ sha224(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: sha224(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for sha256
+query TT
+EXPLAIN SELECT
+ sha256(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: sha256(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for sha384
+query TT
+EXPLAIN SELECT
+ sha384(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: sha384(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for sha512
+query TT
+EXPLAIN SELECT
+ sha512(column1_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: sha512(test.column1_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
+## Ensure no casts for digest
+query TT
+EXPLAIN SELECT
+ digest(column1_utf8view, 'md5') as c
+FROM test;
+----
+logical_plan
+01)Projection: digest(test.column1_utf8view, Utf8View("md5")) AS c
+02)--TableScan: test projection=[column1_utf8view]
+
## Ensure no casts for binary operators
# `~` operator (regex match)
query TT
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]