This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new d2d47d3 add blake3 function (#1086)
d2d47d3 is described below
commit d2d47d38b8c1b4605272d7f917406527cdf68bc9
Author: Jiayu Liu <[email protected]>
AuthorDate: Wed Oct 13 00:47:20 2021 +0800
add blake3 function (#1086)
---
datafusion/Cargo.toml | 3 +-
datafusion/src/physical_plan/crypto_expressions.rs | 47 ++++++++++++++++++++--
datafusion/tests/sql.rs | 4 ++
3 files changed, 50 insertions(+), 4 deletions(-)
diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml
index ecc434a..0c0b366 100644
--- a/datafusion/Cargo.toml
+++ b/datafusion/Cargo.toml
@@ -39,7 +39,7 @@ path = "src/lib.rs"
[features]
default = ["crypto_expressions", "regex_expressions", "unicode_expressions"]
simd = ["arrow/simd"]
-crypto_expressions = ["md-5", "sha2", "blake2"]
+crypto_expressions = ["md-5", "sha2", "blake2", "blake3"]
regex_expressions = ["regex", "lazy_static"]
unicode_expressions = ["unicode-segmentation"]
# Used for testing ONLY: causes all values to hash to the same value (test for
collisions)
@@ -65,6 +65,7 @@ log = "^0.4"
md-5 = { version = "^0.9.1", optional = true }
sha2 = { version = "^0.9.1", optional = true }
blake2 = { version = "^0.9.2", optional = true }
+blake3 = { version = "1.0", optional = true }
ordered-float = "2.0"
unicode-segmentation = { version = "^1.7.1", optional = true }
regex = { version = "^1.4.3", optional = true }
diff --git a/datafusion/src/physical_plan/crypto_expressions.rs
b/datafusion/src/physical_plan/crypto_expressions.rs
index 8c575bc..25d63b8 100644
--- a/datafusion/src/physical_plan/crypto_expressions.rs
+++ b/datafusion/src/physical_plan/crypto_expressions.rs
@@ -29,6 +29,7 @@ use arrow::{
datatypes::DataType,
};
use blake2::{Blake2b, Blake2s, Digest};
+use blake3::Hasher as Blake3;
use md5::Md5;
use sha2::{Sha224, Sha256, Sha384, Sha512};
use std::any::type_name;
@@ -51,6 +52,7 @@ enum DigestAlgorithm {
Sha512,
Blake2s,
Blake2b,
+ Blake3,
}
fn digest_process(
@@ -117,6 +119,11 @@ impl DigestAlgorithm {
Self::Sha512 => digest_to_scalar!(Sha512, value),
Self::Blake2b => digest_to_scalar!(Blake2b, value),
Self::Blake2s => digest_to_scalar!(Blake2s, value),
+ Self::Blake3 => ScalarValue::Binary(value.as_ref().map(|v| {
+ let mut digest = Blake3::default();
+ digest.update(v.as_bytes());
+ digest.finalize().as_bytes().to_vec()
+ })),
})
}
@@ -142,6 +149,19 @@ impl DigestAlgorithm {
Self::Sha512 => digest_to_array!(Sha512, input_value),
Self::Blake2b => digest_to_array!(Blake2b, input_value),
Self::Blake2s => digest_to_array!(Blake2s, input_value),
+ Self::Blake3 => {
+ let binary_array: BinaryArray = input_value
+ .iter()
+ .map(|opt| {
+ opt.map(|x| {
+ let mut digest = Blake3::default();
+ digest.update(x.as_bytes());
+ digest.finalize().as_bytes().to_vec()
+ })
+ })
+ .collect();
+ Arc::new(binary_array)
+ }
};
Ok(ColumnarValue::Array(array))
}
@@ -164,11 +184,27 @@ impl FromStr for DigestAlgorithm {
"sha512" => Self::Sha512,
"blake2b" => Self::Blake2b,
"blake2s" => Self::Blake2s,
+ "blake3" => Self::Blake3,
_ => {
+ let options = [
+ Self::Md5,
+ Self::Sha224,
+ Self::Sha256,
+ Self::Sha384,
+ Self::Sha512,
+ Self::Blake2s,
+ Self::Blake2b,
+ Self::Blake3,
+ ]
+ .iter()
+ .map(|i| i.to_string())
+ .collect::<Vec<_>>()
+ .join(", ");
return Err(DataFusionError::Plan(format!(
- "There is no built-in digest algorithm named {}",
- name
- )))
+ "There is no built-in digest algorithm named '{}',
currently supported algorithms are: {}",
+ name,
+ options,
+ )));
}
})
}
@@ -271,6 +307,11 @@ define_digest_function!(
Blake2s,
"computes blake2s hash digest of the given input"
);
+define_digest_function!(
+ blake3,
+ Blake3,
+ "computes blake3 hash digest of the given input"
+);
/// Digest computes a binary hash of the given data, accepts Utf8 or LargeUtf8
and returns a [`ColumnarValue`].
/// Second argument is the algorithm to use.
diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs
index e822542..6c85f35 100644
--- a/datafusion/tests/sql.rs
+++ b/datafusion/tests/sql.rs
@@ -4079,6 +4079,10 @@ async fn test_crypto_expressions() -> Result<()> {
"digest('tom','blake2s')",
"5fc3f2b3a07cade5023c3df566e4d697d3823ba1b72bfb3e84cf7e768b2e7529"
);
+ test_expression!(
+ "digest('','blake3')",
+ "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
+ );
Ok(())
}