This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new f22231e feat: add xxhash64 (#39)
f22231e is described below
commit f22231e28732a07e94807e3299d01ec6cd43a4d1
Author: Chojan Shang <[email protected]>
AuthorDate: Fri Dec 26 06:01:10 2025 +0800
feat: add xxhash64 (#39)
Signed-off-by: Chojan Shang <[email protected]>
---
datasketches/src/hash/mod.rs | 3 +
datasketches/src/hash/xxhash.rs | 250 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 253 insertions(+)
diff --git a/datasketches/src/hash/mod.rs b/datasketches/src/hash/mod.rs
index 7059dbb..ae9b280 100644
--- a/datasketches/src/hash/mod.rs
+++ b/datasketches/src/hash/mod.rs
@@ -16,5 +16,8 @@
// under the License.
mod murmurhash;
+mod xxhash;
pub use murmurhash::MurmurHash3X64128;
+#[allow(unused_imports)]
+pub use xxhash::XxHash64;
diff --git a/datasketches/src/hash/xxhash.rs b/datasketches/src/hash/xxhash.rs
new file mode 100644
index 0000000..d1f0eed
--- /dev/null
+++ b/datasketches/src/hash/xxhash.rs
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::hash::Hasher;
+
+use byteorder::ByteOrder;
+use byteorder::LE;
+
/// Seed used by `XxHash64::default()`.
const DEFAULT_SEED: u64 = 0;

// Unsigned 64-bit primes from xxhash64.
const P1: u64 = 0x9E3779B185EBCA87;
const P2: u64 = 0xC2B2AE3D27D4EB4F;
const P3: u64 = 0x165667B19E3779F9;
const P4: u64 = 0x85EBCA77C2B2AE63;
const P5: u64 = 0x27D4EB2F165667C5;

/// Reads a little-endian `u64` from the first 8 bytes of `bytes`.
///
/// Uses the standard library's `u64::from_le_bytes` instead of
/// `byteorder::LE::read_u64`, so this module's logic has no
/// external-crate dependency. Panics if `bytes.len() < 8` (same as the
/// byteorder call it replaces).
#[inline]
fn read_u64_le(bytes: &[u8]) -> u64 {
    let mut raw = [0u8; 8];
    raw.copy_from_slice(&bytes[..8]);
    u64::from_le_bytes(raw)
}

/// Reads a little-endian `u32` from the first 4 bytes of `bytes`.
/// Panics if `bytes.len() < 4`.
#[inline]
fn read_u32_le(bytes: &[u8]) -> u32 {
    let mut raw = [0u8; 4];
    raw.copy_from_slice(&bytes[..4]);
    u32::from_le_bytes(raw)
}

/// The XxHash64 is a fast, non-cryptographic, 64-bit hash function that has
/// excellent avalanche and 2-way bit independence properties.
///
/// The streaming state mirrors the reference implementation: four 64-bit
/// accumulators consume the input in 32-byte stripes, and any partial
/// stripe is buffered until more input arrives or the hash is finalized.
#[derive(Debug)]
pub struct XxHash64 {
    seed: u64,         // construction seed; also the base for < 32-byte inputs
    total_len: u64,    // total bytes written so far (mixed into the result)
    v1: u64,           // stripe accumulators v1..v4 per the xxHash64 spec
    v2: u64,
    v3: u64,
    v4: u64,
    buffer: [u8; 32],  // holds a partial stripe between writes
    buffer_len: usize, // number of valid bytes currently in `buffer`
}

impl XxHash64 {
    /// Creates a hasher seeded with `seed`.
    pub fn with_seed(seed: u64) -> Self {
        XxHash64 {
            seed,
            total_len: 0,
            // Accumulator initialization mandated by the xxHash64 spec.
            v1: seed.wrapping_add(P1).wrapping_add(P2),
            v2: seed.wrapping_add(P2),
            v3: seed,
            v4: seed.wrapping_sub(P1),
            buffer: [0; 32],
            buffer_len: 0,
        }
    }

    /// Returns the hash of everything written so far. Takes `&self`, so
    /// the hasher is not consumed and more data may be written afterwards.
    pub fn finish64(&self) -> u64 {
        // If at least one full 32-byte stripe was consumed, converge the
        // four accumulators; otherwise the short-input path starts from
        // the seed alone.
        let mut hash = if self.total_len >= 32 {
            let mut acc = self
                .v1
                .rotate_left(1)
                .wrapping_add(self.v2.rotate_left(7))
                .wrapping_add(self.v3.rotate_left(12))
                .wrapping_add(self.v4.rotate_left(18));
            acc = merge_round(acc, self.v1);
            acc = merge_round(acc, self.v2);
            acc = merge_round(acc, self.v3);
            acc = merge_round(acc, self.v4);
            acc
        } else {
            self.seed.wrapping_add(P5)
        };

        hash = hash.wrapping_add(self.total_len);

        // Fold in the buffered tail: 8-byte words, then at most one
        // 4-byte word, then single bytes — the order the spec requires.
        let buf = &self.buffer[..self.buffer_len];
        let mut idx = 0;
        while idx + 8 <= buf.len() {
            let mut k1 = read_u64_le(&buf[idx..idx + 8]);
            k1 = k1.wrapping_mul(P2);
            k1 = k1.rotate_left(31);
            k1 = k1.wrapping_mul(P1);
            hash ^= k1;
            hash = hash.rotate_left(27).wrapping_mul(P1).wrapping_add(P4);
            idx += 8;
        }

        if idx + 4 <= buf.len() {
            let k1 = read_u32_le(&buf[idx..idx + 4]) as u64;
            hash ^= k1.wrapping_mul(P1);
            hash = hash.rotate_left(23).wrapping_mul(P2).wrapping_add(P3);
            idx += 4;
        }

        while idx < buf.len() {
            let k1 = buf[idx] as u64;
            hash ^= k1.wrapping_mul(P5);
            hash = hash.rotate_left(11).wrapping_mul(P1);
            idx += 1;
        }

        finalize(hash)
    }

    /// One-shot hash of a single `u64`, equivalent to writing
    /// `input.to_le_bytes()` into a fresh hasher and finishing.
    #[allow(dead_code)]
    pub fn hash_u64(input: u64, seed: u64) -> u64 {
        // Short-input path specialized for exactly 8 bytes:
        // start from seed + P5 + length(8), fold one 64-bit word.
        let mut hash = seed.wrapping_add(P5).wrapping_add(8);
        let mut k1 = input;
        k1 = k1.wrapping_mul(P2);
        k1 = k1.rotate_left(31);
        k1 = k1.wrapping_mul(P1);
        hash ^= k1;
        hash = hash.rotate_left(27).wrapping_mul(P1).wrapping_add(P4);
        finalize(hash)
    }

    /// Consumes one full 32-byte stripe into the four accumulators.
    /// `chunk` must be exactly 32 bytes long.
    #[inline]
    fn update(&mut self, chunk: &[u8]) {
        self.v1 = round(self.v1, read_u64_le(&chunk[0..8]));
        self.v2 = round(self.v2, read_u64_le(&chunk[8..16]));
        self.v3 = round(self.v3, read_u64_le(&chunk[16..24]));
        self.v4 = round(self.v4, read_u64_le(&chunk[24..32]));
    }
}

impl Default for XxHash64 {
    /// Equivalent to `XxHash64::with_seed(0)`.
    fn default() -> Self {
        Self::with_seed(DEFAULT_SEED)
    }
}

impl Hasher for XxHash64 {
    /// Same result as [`XxHash64::finish64`].
    fn finish(&self) -> u64 {
        self.finish64()
    }

    /// Streams `bytes` into the hasher; may be called repeatedly and the
    /// result is identical to hashing the concatenation in one call.
    fn write(&mut self, bytes: &[u8]) {
        self.total_len = self.total_len.wrapping_add(bytes.len() as u64);

        // Not enough data to complete a 32-byte stripe: buffer and return.
        if self.buffer_len + bytes.len() < 32 {
            self.buffer[self.buffer_len..self.buffer_len + bytes.len()]
                .copy_from_slice(bytes);
            self.buffer_len += bytes.len();
            return;
        }

        let mut bytes = bytes;

        // First, top up and consume any previously buffered partial stripe.
        if self.buffer_len != 0 {
            let needed = 32 - self.buffer_len;
            self.buffer[self.buffer_len..].copy_from_slice(&bytes[..needed]);
            let chunk = self.buffer;
            self.update(&chunk);
            self.buffer_len = 0;
            bytes = &bytes[needed..];
        }

        // Process whole stripes straight from the input slice.
        let mut chunks = bytes.chunks_exact(32);
        for chunk in &mut chunks {
            self.update(chunk);
        }

        // Stash the (< 32 byte) tail for the next write or finish.
        let remainder = chunks.remainder();
        if !remainder.is_empty() {
            self.buffer[..remainder.len()].copy_from_slice(remainder);
            self.buffer_len = remainder.len();
        }
    }
}

/// One accumulator step: folds a 64-bit input lane into `acc`.
#[inline]
fn round(mut acc: u64, input: u64) -> u64 {
    acc = acc.wrapping_add(input.wrapping_mul(P2));
    acc = acc.rotate_left(31);
    acc.wrapping_mul(P1)
}

/// Merges one stripe accumulator into the converged hash during
/// finalization of inputs >= 32 bytes.
#[inline]
fn merge_round(mut acc: u64, val: u64) -> u64 {
    let mut v = val;
    v = v.wrapping_mul(P2);
    v = v.rotate_left(31);
    v = v.wrapping_mul(P1);
    acc ^= v;
    acc.wrapping_mul(P1).wrapping_add(P4)
}

/// Final avalanche: spreads remaining input differences across all 64
/// output bits via xor-shift / prime-multiply rounds.
#[inline]
fn finalize(mut hash: u64) -> u64 {
    hash ^= hash >> 33;
    hash = hash.wrapping_mul(P2);
    hash ^= hash >> 29;
    hash = hash.wrapping_mul(P3);
    hash ^ (hash >> 32)
}
+
#[cfg(test)]
mod tests {
    use super::*;

    // Constants for the deterministic input generator below. The pattern
    // (seed PRIME32, take top byte, multiply by PRIME64) appears to mirror
    // the sanity-test buffer fill of the reference xxHash test suite —
    // TODO confirm against upstream xxHash.
    const PRIME32: u64 = 0x9E3779B1;
    const PRIME64: u64 = 0x9E3779B185EBCA8D;

    /// Fills a buffer of `len` deterministic pseudo-random bytes, so the
    /// expected hash values below are stable across runs.
    fn fill_test_buffer(len: usize) -> Vec<u8> {
        let mut buffer = vec![0u8; len];
        let mut byte_gen = PRIME32;
        for byte in &mut buffer {
            // Emit the top byte of the generator state, then advance it.
            *byte = (byte_gen >> 56) as u8;
            byte_gen = byte_gen.wrapping_mul(PRIME64);
        }
        buffer
    }

    /// One-shot convenience wrapper: hash `data` with `seed`.
    fn xxhash64(data: &[u8], seed: u64) -> u64 {
        let mut hasher = XxHash64::with_seed(seed);
        hasher.write(data);
        hasher.finish64()
    }

    // Lengths 0, 1, 32, 33 and 100 cover the short-input path, exactly
    // one 32-byte stripe, a stripe plus tail, and the multi-stripe path.
    #[test]
    fn test_vectors_seed_zero() {
        let buf = fill_test_buffer(101);
        assert_eq!(xxhash64(&buf[..0], 0), 0xEF46DB3751D8E999);
        assert_eq!(xxhash64(&buf[..1], 0), 0xE934A84ADB052768);
        assert_eq!(xxhash64(&buf[..32], 0), 0x18B216492BB44B70);
        assert_eq!(xxhash64(&buf[..33], 0), 0x55C8DC3E578F5B59);
        assert_eq!(xxhash64(&buf[..100], 0), 0x4BFE019CD91D9EA4);
    }

    // Same inputs hashed with a non-zero seed must produce different,
    // fixed values.
    #[test]
    fn test_vectors_seed_prime32() {
        let buf = fill_test_buffer(101);
        assert_eq!(xxhash64(&buf[..0], PRIME32), 0xAC75FDA2929B17EF);
        assert_eq!(xxhash64(&buf[..1], PRIME32), 0x5014607643A9B4C3);
        assert_eq!(xxhash64(&buf[..32], PRIME32), 0xB3F33BDF93ADE409);
        assert_eq!(xxhash64(&buf[..100], PRIME32), 0x4853706DC9625CAE);
    }

    // The one-shot `hash_u64` must agree with streaming the value's
    // 8 little-endian bytes through the incremental API.
    #[test]
    fn test_long_check() {
        let seed = 0;
        let hash1 = XxHash64::hash_u64(123, seed);
        let mut hasher = XxHash64::with_seed(seed);
        hasher.write(&123u64.to_le_bytes());
        let hash2 = hasher.finish64();
        assert_eq!(hash2, hash1);
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]