(datafusion-comet) branch main updated: chore: Use twox-hash 2.0 xxhash64 oneshot api instead of custom implementation (#1041)

agrove Wed, 30 Oct 2024 06:36:10 -0700

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git



The following commit(s) were added to refs/heads/main by this push:
     new 00cf79b2 chore: Use twox-hash 2.0 xxhash64 oneshot api instead of 
custom implementation (#1041)
00cf79b2 is described below

commit 00cf79b253d9fa31dcc0506facbaf7b2dac25cdd
Author: NoeB <[email protected]>
AuthorDate: Wed Oct 30 14:34:46 2024 +0100

    chore: Use twox-hash 2.0 xxhash64 oneshot api instead of custom 
implementation (#1041)
---
 LICENSE.txt                                        |  29 +---
 NOTICE.txt                                         |   3 -
 native/Cargo.lock                                  |  16 +-
 native/core/src/execution/sort.rs                  |   2 +-
 .../core/src/parquet/util/test_common/page_util.rs |   2 +-
 native/spark-expr/Cargo.toml                       |   3 +-
 native/spark-expr/benches/conditional.rs           |   4 +-
 native/spark-expr/src/cast.rs                      |   4 +-
 native/spark-expr/src/lib.rs                       |   1 -
 native/spark-expr/src/spark_hash.rs                |   6 +-
 native/spark-expr/src/xxhash64.rs                  | 190 ---------------------
 rust-toolchain.toml                                |   2 +-
 12 files changed, 26 insertions(+), 236 deletions(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index 1afea9f4..920daaf7 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -210,31 +210,4 @@ This project includes code from Apache Aurora.
 
 Copyright: 2016 The Apache Software Foundation.
 Home page: https://aurora.apache.org/
-License: http://www.apache.org/licenses/LICENSE-2.0
-
---------------------------------------------------------------------------------
-
-This project includes software from the twox-hash project
-https://github.com/shepmaster/twox-hash
-
-The MIT License (MIT)
-
-Copyright (c) 2015 Jake Goulding
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
+License: http://www.apache.org/licenses/LICENSE-2.0
\ No newline at end of file
diff --git a/NOTICE.txt b/NOTICE.txt
index b3b1abce..b572b1fa 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -4,9 +4,6 @@ Copyright 2024 The Apache Software Foundation
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).
 
-This product includes software from the twox-hash project (MIT License)
-https://github.com/shepmaster/twox-hash
-
 This product includes software developed at
 Apache Gluten (https://github.com/apache/incubator-gluten/)
 Specifically:
diff --git a/native/Cargo.lock b/native/Cargo.lock
index 86171890..39db7f9d 100644
--- a/native/Cargo.lock
+++ b/native/Cargo.lock
@@ -944,7 +944,7 @@ dependencies = [
  "rand",
  "regex",
  "thiserror",
- "twox-hash",
+ "twox-hash 2.0.0",
 ]
 
 [[package]]
@@ -1937,7 +1937,7 @@ version = "0.11.3"
 source = "registry+https://github.com/rust-lang/crates.io-index";
 checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
 dependencies = [
- "twox-hash",
+ "twox-hash 1.6.3",
 ]
 
 [[package]]
@@ -2184,7 +2184,7 @@ dependencies = [
  "paste",
  "seq-macro",
  "thrift",
- "twox-hash",
+ "twox-hash 1.6.3",
 ]
 
 [[package]]
@@ -2986,10 +2986,18 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index";
 checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
 dependencies = [
  "cfg-if",
- "rand",
  "static_assertions",
 ]
 
+[[package]]
+name = "twox-hash"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index";
+checksum = "d09466de2fbca05ea830e16e62943f26607ab2148fb72b642505541711d99ad2"
+dependencies = [
+ "rand",
+]
+
 [[package]]
 name = "typemap-ors"
 version = "1.0.0"
diff --git a/native/core/src/execution/sort.rs 
b/native/core/src/execution/sort.rs
index b8687652..8579949c 100644
--- a/native/core/src/execution/sort.rs
+++ b/native/core/src/execution/sort.rs
@@ -19,7 +19,7 @@ use std::{cmp, mem, ptr};
 
 /// This is a copy of the `rdxsort-rs` crate, with the following changes:
 /// - removed `Rdx` implementations for all types except for i64 which is the 
packed representation
-/// of row addresses and partition ids from Spark.
+///   of row addresses and partition ids from Spark.
 
 pub trait Rdx {
     /// Sets the number of buckets used by the generic implementation.
diff --git a/native/core/src/parquet/util/test_common/page_util.rs 
b/native/core/src/parquet/util/test_common/page_util.rs
index 8fe5e65c..e20cc30c 100644
--- a/native/core/src/parquet/util/test_common/page_util.rs
+++ b/native/core/src/parquet/util/test_common/page_util.rs
@@ -48,7 +48,7 @@ pub trait DataPageBuilder {
 ///   - add_def_levels()
 ///   - add_values() for normal data page / add_indices() for dictionary data 
page
 ///   - consume()
-/// in order to populate and obtain a data page.
+///     in order to populate and obtain a data page.
 pub struct DataPageBuilderImpl {
     desc: ColumnDescPtr,
     encoding: Option<Encoding>,
diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml
index a5d15691..532bf743 100644
--- a/native/spark-expr/Cargo.toml
+++ b/native/spark-expr/Cargo.toml
@@ -39,12 +39,13 @@ chrono-tz = { workspace = true }
 num = { workspace = true }
 regex = { workspace = true }
 thiserror = { workspace = true }
+twox-hash = "2.0.0"
 
 [dev-dependencies]
 arrow-data = {workspace = true}
 criterion = "0.5.1"
 rand = { workspace = true}
-twox-hash = "1.6.3"
+
 
 [lib]
 name = "datafusion_comet_spark_expr"
diff --git a/native/spark-expr/benches/conditional.rs 
b/native/spark-expr/benches/conditional.rs
index 444928d5..97cd9fcb 100644
--- a/native/spark-expr/benches/conditional.rs
+++ b/native/spark-expr/benches/conditional.rs
@@ -51,12 +51,12 @@ fn criterion_benchmark(c: &mut Criterion) {
         if i % 7 == 0 {
             c2.append_null();
         } else {
-            c2.append_value(&format!("string {i}"));
+            c2.append_value(format!("string {i}"));
         }
         if i % 9 == 0 {
             c3.append_null();
         } else {
-            c3.append_value(&format!("other string {i}"));
+            c3.append_value(format!("other string {i}"));
         }
     }
     let c1 = Arc::new(c1.finish());
diff --git a/native/spark-expr/src/cast.rs b/native/spark-expr/src/cast.rs
index 6a3974fe..0224aabf 100644
--- a/native/spark-expr/src/cast.rs
+++ b/native/spark-expr/src/cast.rs
@@ -1568,9 +1568,7 @@ fn get_timestamp_values<T: TimeZone>(
     timestamp_type: &str,
     tz: &T,
 ) -> SparkResult<Option<i64>> {
-    let values: Vec<_> = value
-        .split(|c| c == 'T' || c == '-' || c == ':' || c == '.')
-        .collect();
+    let values: Vec<_> = value.split(['T', '-', ':', '.']).collect();
     let year = values[0].parse::<i32>().unwrap_or_default();
     let month = values.get(1).map_or(1, |m| m.parse::<u32>().unwrap_or(1));
     let day = values.get(2).map_or(1, |d| d.parse::<u32>().unwrap_or(1));
diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs
index cc22dfcb..614b48f2 100644
--- a/native/spark-expr/src/lib.rs
+++ b/native/spark-expr/src/lib.rs
@@ -33,7 +33,6 @@ mod temporal;
 pub mod timezone;
 mod to_json;
 pub mod utils;
-mod xxhash64;
 
 pub use cast::{spark_cast, Cast};
 pub use error::{SparkError, SparkResult};
diff --git a/native/spark-expr/src/spark_hash.rs 
b/native/spark-expr/src/spark_hash.rs
index 66a103a2..1402f717 100644
--- a/native/spark-expr/src/spark_hash.rs
+++ b/native/spark-expr/src/spark_hash.rs
@@ -22,6 +22,7 @@ use arrow::{
     datatypes::{ArrowNativeTypeOp, UInt16Type, UInt32Type, UInt64Type, 
UInt8Type},
 };
 use std::sync::Arc;
+use twox_hash::XxHash64;
 
 use datafusion::{
     arrow::{
@@ -34,7 +35,10 @@ use datafusion::{
     error::{DataFusionError, Result},
 };
 
-use crate::xxhash64::spark_compatible_xxhash64;
+#[inline]
+pub(crate) fn spark_compatible_xxhash64<T: AsRef<[u8]>>(data: T, seed: u64) -> 
u64 {
+    XxHash64::oneshot(seed, data.as_ref())
+}
 
 /// Spark-compatible murmur3 hash function
 #[inline]
diff --git a/native/spark-expr/src/xxhash64.rs 
b/native/spark-expr/src/xxhash64.rs
deleted file mode 100644
index f5a11f66..00000000
--- a/native/spark-expr/src/xxhash64.rs
+++ /dev/null
@@ -1,190 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! xxhash64 implementation
-
-const CHUNK_SIZE: usize = 32;
-
-const PRIME_1: u64 = 11_400_714_785_074_694_791;
-const PRIME_2: u64 = 14_029_467_366_897_019_727;
-const PRIME_3: u64 = 1_609_587_929_392_839_161;
-const PRIME_4: u64 = 9_650_029_242_287_828_579;
-const PRIME_5: u64 = 2_870_177_450_012_600_261;
-
-/// Custom implementation of xxhash64 based on code from 
https://github.com/shepmaster/twox-hash
-/// but optimized for our use case by removing any intermediate buffering, 
which is
-/// not required because we are operating on data that is already in memory.
-#[inline]
-pub(crate) fn spark_compatible_xxhash64<T: AsRef<[u8]>>(data: T, seed: u64) -> 
u64 {
-    let data: &[u8] = data.as_ref();
-    let length_bytes = data.len();
-
-    let mut v1 = seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2);
-    let mut v2 = seed.wrapping_add(PRIME_2);
-    let mut v3 = seed;
-    let mut v4 = seed.wrapping_sub(PRIME_1);
-
-    // process chunks of 32 bytes
-    let mut offset_u64_4 = 0;
-    let ptr_u64 = data.as_ptr() as *const u64;
-    unsafe {
-        while offset_u64_4 * CHUNK_SIZE + CHUNK_SIZE <= length_bytes {
-            v1 = ingest_one_number(v1, ptr_u64.add(offset_u64_4 * 
4).read_unaligned().to_le());
-            v2 = ingest_one_number(
-                v2,
-                ptr_u64.add(offset_u64_4 * 4 + 1).read_unaligned().to_le(),
-            );
-            v3 = ingest_one_number(
-                v3,
-                ptr_u64.add(offset_u64_4 * 4 + 2).read_unaligned().to_le(),
-            );
-            v4 = ingest_one_number(
-                v4,
-                ptr_u64.add(offset_u64_4 * 4 + 3).read_unaligned().to_le(),
-            );
-            offset_u64_4 += 1;
-        }
-    }
-
-    let mut hash = if length_bytes >= CHUNK_SIZE {
-        // We have processed at least one full chunk
-        let mut hash = v1.rotate_left(1);
-        hash = hash.wrapping_add(v2.rotate_left(7));
-        hash = hash.wrapping_add(v3.rotate_left(12));
-        hash = hash.wrapping_add(v4.rotate_left(18));
-
-        hash = mix_one(hash, v1);
-        hash = mix_one(hash, v2);
-        hash = mix_one(hash, v3);
-        hash = mix_one(hash, v4);
-
-        hash
-    } else {
-        seed.wrapping_add(PRIME_5)
-    };
-
-    hash = hash.wrapping_add(length_bytes as u64);
-
-    // process u64s
-    let mut offset_u64 = offset_u64_4 * 4;
-    while offset_u64 * 8 + 8 <= length_bytes {
-        let mut k1 = unsafe {
-            ptr_u64
-                .add(offset_u64)
-                .read_unaligned()
-                .to_le()
-                .wrapping_mul(PRIME_2)
-        };
-        k1 = k1.rotate_left(31);
-        k1 = k1.wrapping_mul(PRIME_1);
-        hash ^= k1;
-        hash = hash.rotate_left(27);
-        hash = hash.wrapping_mul(PRIME_1);
-        hash = hash.wrapping_add(PRIME_4);
-        offset_u64 += 1;
-    }
-
-    // process u32s
-    let data = &data[offset_u64 * 8..];
-    let ptr_u32 = data.as_ptr() as *const u32;
-    let length_bytes = length_bytes - offset_u64 * 8;
-    let mut offset_u32 = 0;
-    while offset_u32 * 4 + 4 <= length_bytes {
-        let k1 = unsafe {
-            
u64::from(ptr_u32.add(offset_u32).read_unaligned().to_le()).wrapping_mul(PRIME_1)
-        };
-        hash ^= k1;
-        hash = hash.rotate_left(23);
-        hash = hash.wrapping_mul(PRIME_2);
-        hash = hash.wrapping_add(PRIME_3);
-        offset_u32 += 1;
-    }
-
-    // process u8s
-    let data = &data[offset_u32 * 4..];
-    let length_bytes = length_bytes - offset_u32 * 4;
-    let mut offset_u8 = 0;
-    while offset_u8 < length_bytes {
-        let k1 = u64::from(data[offset_u8]).wrapping_mul(PRIME_5);
-        hash ^= k1;
-        hash = hash.rotate_left(11);
-        hash = hash.wrapping_mul(PRIME_1);
-        offset_u8 += 1;
-    }
-
-    // The final intermixing
-    hash ^= hash >> 33;
-    hash = hash.wrapping_mul(PRIME_2);
-    hash ^= hash >> 29;
-    hash = hash.wrapping_mul(PRIME_3);
-    hash ^= hash >> 32;
-
-    hash
-}
-
-#[inline(always)]
-fn ingest_one_number(mut current_value: u64, mut value: u64) -> u64 {
-    value = value.wrapping_mul(PRIME_2);
-    current_value = current_value.wrapping_add(value);
-    current_value = current_value.rotate_left(31);
-    current_value.wrapping_mul(PRIME_1)
-}
-
-#[inline(always)]
-fn mix_one(mut hash: u64, mut value: u64) -> u64 {
-    value = value.wrapping_mul(PRIME_2);
-    value = value.rotate_left(31);
-    value = value.wrapping_mul(PRIME_1);
-    hash ^= value;
-    hash = hash.wrapping_mul(PRIME_1);
-    hash.wrapping_add(PRIME_4)
-}
-
-#[cfg(test)]
-mod test {
-    use super::spark_compatible_xxhash64;
-    use rand::Rng;
-    use std::hash::Hasher;
-    use twox_hash::XxHash64;
-
-    #[test]
-    #[cfg_attr(miri, ignore)] // test takes too long with miri
-    fn test_xxhash64_random() {
-        let mut rng = rand::thread_rng();
-        for len in 0..128 {
-            for _ in 0..10 {
-                let data: Vec<u8> = (0..len).map(|_| rng.gen()).collect();
-                let seed = rng.gen();
-                check_xxhash64(&data, seed);
-            }
-        }
-    }
-
-    fn check_xxhash64(data: &[u8], seed: u64) {
-        let mut hasher = XxHash64::with_seed(seed);
-        hasher.write(data.as_ref());
-        let hash1 = hasher.finish();
-        let hash2 = spark_compatible_xxhash64(data, seed);
-        if hash1 != hash2 {
-            panic!("input: {} with seed {seed} produced incorrect hash 
(comet={hash2}, twox-hash={hash1})",
-                   data.iter().fold(String::new(), |mut output, byte| {
-                       output.push_str(&format!("{:02x}", byte));
-                       output
-                   }))
-        }
-    }
-}
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index fe9eee97..780fcd7f 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -16,5 +16,5 @@
 # under the License.
 
 [toolchain]
-channel = "1.79"
+channel = "1.81"
 components = ["rustfmt", "clippy", "rust-analyzer"]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion-comet) branch main updated: chore: Use twox-hash 2.0 xxhash64 oneshot api instead of custom implementation (#1041)

Reply via email to