This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 00cf79b2 chore: Use twox-hash 2.0 xxhash64 oneshot api instead of
custom implementation (#1041)
00cf79b2 is described below
commit 00cf79b253d9fa31dcc0506facbaf7b2dac25cdd
Author: NoeB <[email protected]>
AuthorDate: Wed Oct 30 14:34:46 2024 +0100
chore: Use twox-hash 2.0 xxhash64 oneshot api instead of custom
implementation (#1041)
---
LICENSE.txt | 29 +---
NOTICE.txt | 3 -
native/Cargo.lock | 16 +-
native/core/src/execution/sort.rs | 2 +-
.../core/src/parquet/util/test_common/page_util.rs | 2 +-
native/spark-expr/Cargo.toml | 3 +-
native/spark-expr/benches/conditional.rs | 4 +-
native/spark-expr/src/cast.rs | 4 +-
native/spark-expr/src/lib.rs | 1 -
native/spark-expr/src/spark_hash.rs | 6 +-
native/spark-expr/src/xxhash64.rs | 190 ---------------------
rust-toolchain.toml | 2 +-
12 files changed, 26 insertions(+), 236 deletions(-)
diff --git a/LICENSE.txt b/LICENSE.txt
index 1afea9f4..920daaf7 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -210,31 +210,4 @@ This project includes code from Apache Aurora.
Copyright: 2016 The Apache Software Foundation.
Home page: https://aurora.apache.org/
-License: http://www.apache.org/licenses/LICENSE-2.0
-
---------------------------------------------------------------------------------
-
-This project includes software from the twox-hash project
-https://github.com/shepmaster/twox-hash
-
-The MIT License (MIT)
-
-Copyright (c) 2015 Jake Goulding
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
+License: http://www.apache.org/licenses/LICENSE-2.0
\ No newline at end of file
diff --git a/NOTICE.txt b/NOTICE.txt
index b3b1abce..b572b1fa 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -4,9 +4,6 @@ Copyright 2024 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
-This product includes software from the twox-hash project (MIT License)
-https://github.com/shepmaster/twox-hash
-
This product includes software developed at
Apache Gluten (https://github.com/apache/incubator-gluten/)
Specifically:
diff --git a/native/Cargo.lock b/native/Cargo.lock
index 86171890..39db7f9d 100644
--- a/native/Cargo.lock
+++ b/native/Cargo.lock
@@ -944,7 +944,7 @@ dependencies = [
"rand",
"regex",
"thiserror",
- "twox-hash",
+ "twox-hash 2.0.0",
]
[[package]]
@@ -1937,7 +1937,7 @@ version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
dependencies = [
- "twox-hash",
+ "twox-hash 1.6.3",
]
[[package]]
@@ -2184,7 +2184,7 @@ dependencies = [
"paste",
"seq-macro",
"thrift",
- "twox-hash",
+ "twox-hash 1.6.3",
]
[[package]]
@@ -2986,10 +2986,18 @@ source =
"registry+https://github.com/rust-lang/crates.io-index"
checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
dependencies = [
"cfg-if",
- "rand",
"static_assertions",
]
+[[package]]
+name = "twox-hash"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d09466de2fbca05ea830e16e62943f26607ab2148fb72b642505541711d99ad2"
+dependencies = [
+ "rand",
+]
+
[[package]]
name = "typemap-ors"
version = "1.0.0"
diff --git a/native/core/src/execution/sort.rs
b/native/core/src/execution/sort.rs
index b8687652..8579949c 100644
--- a/native/core/src/execution/sort.rs
+++ b/native/core/src/execution/sort.rs
@@ -19,7 +19,7 @@ use std::{cmp, mem, ptr};
/// This is a copy of the `rdxsort-rs` crate, with the following changes:
/// - removed `Rdx` implementations for all types except for i64 which is the
packed representation
-/// of row addresses and partition ids from Spark.
+/// of row addresses and partition ids from Spark.
pub trait Rdx {
/// Sets the number of buckets used by the generic implementation.
diff --git a/native/core/src/parquet/util/test_common/page_util.rs
b/native/core/src/parquet/util/test_common/page_util.rs
index 8fe5e65c..e20cc30c 100644
--- a/native/core/src/parquet/util/test_common/page_util.rs
+++ b/native/core/src/parquet/util/test_common/page_util.rs
@@ -48,7 +48,7 @@ pub trait DataPageBuilder {
/// - add_def_levels()
/// - add_values() for normal data page / add_indices() for dictionary data
page
/// - consume()
-/// in order to populate and obtain a data page.
+/// in order to populate and obtain a data page.
pub struct DataPageBuilderImpl {
desc: ColumnDescPtr,
encoding: Option<Encoding>,
diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml
index a5d15691..532bf743 100644
--- a/native/spark-expr/Cargo.toml
+++ b/native/spark-expr/Cargo.toml
@@ -39,12 +39,13 @@ chrono-tz = { workspace = true }
num = { workspace = true }
regex = { workspace = true }
thiserror = { workspace = true }
+twox-hash = "2.0.0"
[dev-dependencies]
arrow-data = {workspace = true}
criterion = "0.5.1"
rand = { workspace = true}
-twox-hash = "1.6.3"
+
[lib]
name = "datafusion_comet_spark_expr"
diff --git a/native/spark-expr/benches/conditional.rs
b/native/spark-expr/benches/conditional.rs
index 444928d5..97cd9fcb 100644
--- a/native/spark-expr/benches/conditional.rs
+++ b/native/spark-expr/benches/conditional.rs
@@ -51,12 +51,12 @@ fn criterion_benchmark(c: &mut Criterion) {
if i % 7 == 0 {
c2.append_null();
} else {
- c2.append_value(&format!("string {i}"));
+ c2.append_value(format!("string {i}"));
}
if i % 9 == 0 {
c3.append_null();
} else {
- c3.append_value(&format!("other string {i}"));
+ c3.append_value(format!("other string {i}"));
}
}
let c1 = Arc::new(c1.finish());
diff --git a/native/spark-expr/src/cast.rs b/native/spark-expr/src/cast.rs
index 6a3974fe..0224aabf 100644
--- a/native/spark-expr/src/cast.rs
+++ b/native/spark-expr/src/cast.rs
@@ -1568,9 +1568,7 @@ fn get_timestamp_values<T: TimeZone>(
timestamp_type: &str,
tz: &T,
) -> SparkResult<Option<i64>> {
- let values: Vec<_> = value
- .split(|c| c == 'T' || c == '-' || c == ':' || c == '.')
- .collect();
+ let values: Vec<_> = value.split(['T', '-', ':', '.']).collect();
let year = values[0].parse::<i32>().unwrap_or_default();
let month = values.get(1).map_or(1, |m| m.parse::<u32>().unwrap_or(1));
let day = values.get(2).map_or(1, |d| d.parse::<u32>().unwrap_or(1));
diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs
index cc22dfcb..614b48f2 100644
--- a/native/spark-expr/src/lib.rs
+++ b/native/spark-expr/src/lib.rs
@@ -33,7 +33,6 @@ mod temporal;
pub mod timezone;
mod to_json;
pub mod utils;
-mod xxhash64;
pub use cast::{spark_cast, Cast};
pub use error::{SparkError, SparkResult};
diff --git a/native/spark-expr/src/spark_hash.rs
b/native/spark-expr/src/spark_hash.rs
index 66a103a2..1402f717 100644
--- a/native/spark-expr/src/spark_hash.rs
+++ b/native/spark-expr/src/spark_hash.rs
@@ -22,6 +22,7 @@ use arrow::{
datatypes::{ArrowNativeTypeOp, UInt16Type, UInt32Type, UInt64Type,
UInt8Type},
};
use std::sync::Arc;
+use twox_hash::XxHash64;
use datafusion::{
arrow::{
@@ -34,7 +35,10 @@ use datafusion::{
error::{DataFusionError, Result},
};
-use crate::xxhash64::spark_compatible_xxhash64;
+#[inline]
+pub(crate) fn spark_compatible_xxhash64<T: AsRef<[u8]>>(data: T, seed: u64) ->
u64 {
+ XxHash64::oneshot(seed, data.as_ref())
+}
/// Spark-compatible murmur3 hash function
#[inline]
diff --git a/native/spark-expr/src/xxhash64.rs
b/native/spark-expr/src/xxhash64.rs
deleted file mode 100644
index f5a11f66..00000000
--- a/native/spark-expr/src/xxhash64.rs
+++ /dev/null
@@ -1,190 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! xxhash64 implementation
-
-const CHUNK_SIZE: usize = 32;
-
-const PRIME_1: u64 = 11_400_714_785_074_694_791;
-const PRIME_2: u64 = 14_029_467_366_897_019_727;
-const PRIME_3: u64 = 1_609_587_929_392_839_161;
-const PRIME_4: u64 = 9_650_029_242_287_828_579;
-const PRIME_5: u64 = 2_870_177_450_012_600_261;
-
-/// Custom implementation of xxhash64 based on code from
https://github.com/shepmaster/twox-hash
-/// but optimized for our use case by removing any intermediate buffering,
which is
-/// not required because we are operating on data that is already in memory.
-#[inline]
-pub(crate) fn spark_compatible_xxhash64<T: AsRef<[u8]>>(data: T, seed: u64) ->
u64 {
- let data: &[u8] = data.as_ref();
- let length_bytes = data.len();
-
- let mut v1 = seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2);
- let mut v2 = seed.wrapping_add(PRIME_2);
- let mut v3 = seed;
- let mut v4 = seed.wrapping_sub(PRIME_1);
-
- // process chunks of 32 bytes
- let mut offset_u64_4 = 0;
- let ptr_u64 = data.as_ptr() as *const u64;
- unsafe {
- while offset_u64_4 * CHUNK_SIZE + CHUNK_SIZE <= length_bytes {
- v1 = ingest_one_number(v1, ptr_u64.add(offset_u64_4 *
4).read_unaligned().to_le());
- v2 = ingest_one_number(
- v2,
- ptr_u64.add(offset_u64_4 * 4 + 1).read_unaligned().to_le(),
- );
- v3 = ingest_one_number(
- v3,
- ptr_u64.add(offset_u64_4 * 4 + 2).read_unaligned().to_le(),
- );
- v4 = ingest_one_number(
- v4,
- ptr_u64.add(offset_u64_4 * 4 + 3).read_unaligned().to_le(),
- );
- offset_u64_4 += 1;
- }
- }
-
- let mut hash = if length_bytes >= CHUNK_SIZE {
- // We have processed at least one full chunk
- let mut hash = v1.rotate_left(1);
- hash = hash.wrapping_add(v2.rotate_left(7));
- hash = hash.wrapping_add(v3.rotate_left(12));
- hash = hash.wrapping_add(v4.rotate_left(18));
-
- hash = mix_one(hash, v1);
- hash = mix_one(hash, v2);
- hash = mix_one(hash, v3);
- hash = mix_one(hash, v4);
-
- hash
- } else {
- seed.wrapping_add(PRIME_5)
- };
-
- hash = hash.wrapping_add(length_bytes as u64);
-
- // process u64s
- let mut offset_u64 = offset_u64_4 * 4;
- while offset_u64 * 8 + 8 <= length_bytes {
- let mut k1 = unsafe {
- ptr_u64
- .add(offset_u64)
- .read_unaligned()
- .to_le()
- .wrapping_mul(PRIME_2)
- };
- k1 = k1.rotate_left(31);
- k1 = k1.wrapping_mul(PRIME_1);
- hash ^= k1;
- hash = hash.rotate_left(27);
- hash = hash.wrapping_mul(PRIME_1);
- hash = hash.wrapping_add(PRIME_4);
- offset_u64 += 1;
- }
-
- // process u32s
- let data = &data[offset_u64 * 8..];
- let ptr_u32 = data.as_ptr() as *const u32;
- let length_bytes = length_bytes - offset_u64 * 8;
- let mut offset_u32 = 0;
- while offset_u32 * 4 + 4 <= length_bytes {
- let k1 = unsafe {
-
u64::from(ptr_u32.add(offset_u32).read_unaligned().to_le()).wrapping_mul(PRIME_1)
- };
- hash ^= k1;
- hash = hash.rotate_left(23);
- hash = hash.wrapping_mul(PRIME_2);
- hash = hash.wrapping_add(PRIME_3);
- offset_u32 += 1;
- }
-
- // process u8s
- let data = &data[offset_u32 * 4..];
- let length_bytes = length_bytes - offset_u32 * 4;
- let mut offset_u8 = 0;
- while offset_u8 < length_bytes {
- let k1 = u64::from(data[offset_u8]).wrapping_mul(PRIME_5);
- hash ^= k1;
- hash = hash.rotate_left(11);
- hash = hash.wrapping_mul(PRIME_1);
- offset_u8 += 1;
- }
-
- // The final intermixing
- hash ^= hash >> 33;
- hash = hash.wrapping_mul(PRIME_2);
- hash ^= hash >> 29;
- hash = hash.wrapping_mul(PRIME_3);
- hash ^= hash >> 32;
-
- hash
-}
-
-#[inline(always)]
-fn ingest_one_number(mut current_value: u64, mut value: u64) -> u64 {
- value = value.wrapping_mul(PRIME_2);
- current_value = current_value.wrapping_add(value);
- current_value = current_value.rotate_left(31);
- current_value.wrapping_mul(PRIME_1)
-}
-
-#[inline(always)]
-fn mix_one(mut hash: u64, mut value: u64) -> u64 {
- value = value.wrapping_mul(PRIME_2);
- value = value.rotate_left(31);
- value = value.wrapping_mul(PRIME_1);
- hash ^= value;
- hash = hash.wrapping_mul(PRIME_1);
- hash.wrapping_add(PRIME_4)
-}
-
-#[cfg(test)]
-mod test {
- use super::spark_compatible_xxhash64;
- use rand::Rng;
- use std::hash::Hasher;
- use twox_hash::XxHash64;
-
- #[test]
- #[cfg_attr(miri, ignore)] // test takes too long with miri
- fn test_xxhash64_random() {
- let mut rng = rand::thread_rng();
- for len in 0..128 {
- for _ in 0..10 {
- let data: Vec<u8> = (0..len).map(|_| rng.gen()).collect();
- let seed = rng.gen();
- check_xxhash64(&data, seed);
- }
- }
- }
-
- fn check_xxhash64(data: &[u8], seed: u64) {
- let mut hasher = XxHash64::with_seed(seed);
- hasher.write(data.as_ref());
- let hash1 = hasher.finish();
- let hash2 = spark_compatible_xxhash64(data, seed);
- if hash1 != hash2 {
- panic!("input: {} with seed {seed} produced incorrect hash
(comet={hash2}, twox-hash={hash1})",
- data.iter().fold(String::new(), |mut output, byte| {
- output.push_str(&format!("{:02x}", byte));
- output
- }))
- }
- }
-}
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index fe9eee97..780fcd7f 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -16,5 +16,5 @@
# under the License.
[toolchain]
-channel = "1.79"
+channel = "1.81"
components = ["rustfmt", "clippy", "rust-analyzer"]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]