This is an automated email from the ASF dual-hosted git repository.
jiayuliu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new f38443d [python] add digest python function (#1127)
f38443d is described below
commit f38443d2338ea6953e3ce3178e937173bb53df94
Author: Jiayu Liu <[email protected]>
AuthorDate: Sun Oct 17 23:15:19 2021 +0800
[python] add digest python function (#1127)
* add digest python function
* add test result
* ignore long lines
---
.github/workflows/python_test.yaml | 2 +-
python/src/functions.rs | 13 +++++++
python/tests/test_string_functions.py | 72 +++++++++++++++++++++++++++++++++++
3 files changed, 86 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml
index 5419adb..17e7abb 100644
--- a/.github/workflows/python_test.yaml
+++ b/.github/workflows/python_test.yaml
@@ -49,7 +49,7 @@ jobs:
- name: Run Linters
run: |
source venv/bin/activate
- flake8 python
+ flake8 python --ignore=E501
black --line-length 79 --check python
- name: Run tests
run: |
diff --git a/python/src/functions.rs b/python/src/functions.rs
index 8611ca5..cecf28d 100644
--- a/python/src/functions.rs
+++ b/python/src/functions.rs
@@ -93,6 +93,18 @@ fn random() -> expression::Expression {
}
}
+/// Computes a binary hash of the given data. type is the algorithm to use.
+/// Standard algorithms are md5, sha224, sha256, sha384, sha512, blake2s, blake2b, and blake3.
+#[pyfunction(value, method)]
+fn digest(
+ value: expression::Expression,
+ method: expression::Expression,
+) -> expression::Expression {
+ expression::Expression {
+ expr: logical_plan::digest(value.expr, method.expr),
+ }
+}
+
/// Concatenates the text representations of all the arguments.
/// NULL arguments are ignored.
#[pyfunction(args = "*")]
@@ -340,6 +352,7 @@ pub fn init(module: &PyModule) -> PyResult<()> {
module.add_function(wrap_pyfunction!(ltrim, module)?)?;
module.add_function(wrap_pyfunction!(max, module)?)?;
module.add_function(wrap_pyfunction!(md5, module)?)?;
+ module.add_function(wrap_pyfunction!(digest, module)?)?;
module.add_function(wrap_pyfunction!(min, module)?)?;
module.add_function(wrap_pyfunction!(now, module)?)?;
module.add_function(wrap_pyfunction!(octet_length, module)?)?;
diff --git a/python/tests/test_string_functions.py b/python/tests/test_string_functions.py
index ea064a6..965f087 100644
--- a/python/tests/test_string_functions.py
+++ b/python/tests/test_string_functions.py
@@ -47,3 +47,75 @@ def test_string_functions(df):
]
)
assert result.column(1) == pa.array(["hello", "world", "!"])
+
+
+def test_hash_functions(df):
+ df = df.select(
+ *[
+ f.digest(f.col("a"), f.lit(m))
+ for m in ("md5", "sha256", "sha512", "blake2s", "blake3")
+ ]
+ )
+ result = df.collect()
+ assert len(result) == 1
+ result = result[0]
+ b = bytearray.fromhex
+ assert result.column(0) == pa.array(
+ [
+ b("8B1A9953C4611296A827ABF8C47804D7"),
+ b("F5A7924E621E84C9280A9A27E1BCB7F6"),
+ b("9033E0E305F247C0C3C80D0C7848C8B3"),
+ ]
+ )
+ assert result.column(1) == pa.array(
+ [
+ b(
+ "185F8DB32271FE25F561A6FC938B2E264306EC304EDA518007D1764826381969"
+ ),
+ b(
+ "78AE647DC5544D227130A0682A51E30BC7777FBB6D8A8F17007463A3ECD1D524"
+ ),
+ b(
+ "BB7208BC9B5D7C04F1236A82A0093A5E33F40423D5BA8D4266F7092C3BA43B62"
+ ),
+ ]
+ )
+ assert result.column(2) == pa.array(
+ [
+ b(
+ "3615F80C9D293ED7402687F94B22D58E529B8CC7916F8FAC7FDDF7FBD5AF4CF777D3D795A7A00A16BF7E7F3FB9561EE9BAAE480DA9FE7A18769E71886B03F315"
+ ),
+ b(
+ "8EA77393A42AB8FA92500FB077A9509CC32BC95E72712EFA116EDAF2EDFAE34FBB682EFDD6C5DD13C117E08BD4AAEF71291D8AACE2F890273081D0677C16DF0F"
+ ),
+ b(
+ "3831A6A6155E509DEE59A7F451EB35324D8F8F2DF6E3708894740F98FDEE23889F4DE5ADB0C5010DFB555CDA77C8AB5DC902094C52DE3278F35A75EBC25F093A"
+ ),
+ ]
+ )
+ assert result.column(3) == pa.array(
+ [
+ b(
+ "F73A5FBF881F89B814871F46E26AD3FA37CB2921C5E8561618639015B3CCBB71"
+ ),
+ b(
+ "B792A0383FB9E7A189EC150686579532854E44B71AC394831DAED169BA85CCC5"
+ ),
+ b(
+ "27988A0E51812297C77A433F635233346AEE29A829DCF4F46E0F58F402C6CFCB"
+ ),
+ ]
+ )
+ assert result.column(4) == pa.array(
+ [
+ b(
+ "FBC2B0516EE8744D293B980779178A3508850FDCFE965985782C39601B65794F"
+ ),
+ b(
+ "BF73D18575A736E4037D45F9E316085B86C19BE6363DE6AA789E13DEAACC1C4E"
+ ),
+ b(
+ "C8D11B9F7237E4034ADBCD2005735F9BC4C597C75AD89F4492BEC8F77D15F7EB"
+ ),
+ ]
+ )