This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git


The following commit(s) were added to refs/heads/main by this push:
     new f22231e  feat: add xxhash64 (#39)
f22231e is described below

commit f22231e28732a07e94807e3299d01ec6cd43a4d1
Author: Chojan Shang <[email protected]>
AuthorDate: Fri Dec 26 06:01:10 2025 +0800

    feat: add xxhash64 (#39)
    
    Signed-off-by: Chojan Shang <[email protected]>
---
 datasketches/src/hash/mod.rs    |   3 +
 datasketches/src/hash/xxhash.rs | 250 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 253 insertions(+)

diff --git a/datasketches/src/hash/mod.rs b/datasketches/src/hash/mod.rs
index 7059dbb..ae9b280 100644
--- a/datasketches/src/hash/mod.rs
+++ b/datasketches/src/hash/mod.rs
@@ -16,5 +16,8 @@
 // under the License.
 
 mod murmurhash;
+mod xxhash;
 
 pub use murmurhash::MurmurHash3X64128;
+#[allow(unused_imports)]
+pub use xxhash::XxHash64;
diff --git a/datasketches/src/hash/xxhash.rs b/datasketches/src/hash/xxhash.rs
new file mode 100644
index 0000000..d1f0eed
--- /dev/null
+++ b/datasketches/src/hash/xxhash.rs
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::hash::Hasher;
+
+use byteorder::ByteOrder;
+use byteorder::LE;
+
+const DEFAULT_SEED: u64 = 0;
+
+// Unsigned 64-bit primes from xxhash64.
+const P1: u64 = 0x9E3779B185EBCA87;
+const P2: u64 = 0xC2B2AE3D27D4EB4F;
+const P3: u64 = 0x165667B19E3779F9;
+const P4: u64 = 0x85EBCA77C2B2AE63;
+const P5: u64 = 0x27D4EB2F165667C5;
+
+/// XxHash64 is a fast, non-cryptographic, 64-bit hash function with
+/// excellent avalanche and 2-way bit-independence properties.
+#[derive(Debug)]
+pub struct XxHash64 {
+    seed: u64,
+    total_len: u64,
+    v1: u64,
+    v2: u64,
+    v3: u64,
+    v4: u64,
+    buffer: [u8; 32],
+    buffer_len: usize,
+}
+
+impl XxHash64 {
+    pub fn with_seed(seed: u64) -> Self {
+        XxHash64 {
+            seed,
+            total_len: 0,
+            v1: seed.wrapping_add(P1).wrapping_add(P2),
+            v2: seed.wrapping_add(P2),
+            v3: seed,
+            v4: seed.wrapping_sub(P1),
+            buffer: [0; 32],
+            buffer_len: 0,
+        }
+    }
+
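+    /// Returns the 64-bit digest of everything written so far, without
+    /// consuming or resetting the hasher state.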
+    pub fn finish64(&self) -> u64 {
+        let mut hash = if self.total_len >= 32 {
+            let mut acc = self
+                .v1
+                .rotate_left(1)
+                .wrapping_add(self.v2.rotate_left(7))
+                .wrapping_add(self.v3.rotate_left(12))
+                .wrapping_add(self.v4.rotate_left(18));
+            acc = merge_round(acc, self.v1);
+            acc = merge_round(acc, self.v2);
+            acc = merge_round(acc, self.v3);
+            acc = merge_round(acc, self.v4);
+            acc
+        } else {
+            self.seed.wrapping_add(P5)
+        };
+
+        hash = hash.wrapping_add(self.total_len);
+
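+        // Fold any bytes still held in the internal buffer into the hash,
+        // in 8-byte, then 4-byte, then single-byte steps.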
+        let mut idx = 0;
+        let buf = &self.buffer[..self.buffer_len];
+        while idx + 8 <= buf.len() {
+            let mut k1 = LE::read_u64(&buf[idx..idx + 8]);
+            k1 = k1.wrapping_mul(P2);
+            k1 = k1.rotate_left(31);
+            k1 = k1.wrapping_mul(P1);
+            hash ^= k1;
+            hash = hash.rotate_left(27).wrapping_mul(P1).wrapping_add(P4);
+            idx += 8;
+        }
+
+        if idx + 4 <= buf.len() {
+            let k1 = LE::read_u32(&buf[idx..idx + 4]) as u64;
+            hash ^= k1.wrapping_mul(P1);
+            hash = hash.rotate_left(23).wrapping_mul(P2).wrapping_add(P3);
+            idx += 4;
+        }
+
+        while idx < buf.len() {
+            let k1 = buf[idx] as u64;
+            hash ^= k1.wrapping_mul(P5);
+            hash = hash.rotate_left(11).wrapping_mul(P1);
+            idx += 1;
+        }
+
+        finalize(hash)
+    }
+
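+    /// One-shot hash of a single `u64`, equivalent to writing its eight
+    /// little-endian bytes and calling `finish64`.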
+    #[allow(dead_code)]
+    pub fn hash_u64(input: u64, seed: u64) -> u64 {
+        let mut hash = seed.wrapping_add(P5).wrapping_add(8);
+        let mut k1 = input;
+        k1 = k1.wrapping_mul(P2);
+        k1 = k1.rotate_left(31);
+        k1 = k1.wrapping_mul(P1);
+        hash ^= k1;
+        hash = hash.rotate_left(27).wrapping_mul(P1).wrapping_add(P4);
+        finalize(hash)
+    }
+
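+    /// Mixes one full 32-byte chunk into the four accumulator lanes.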
+    #[inline]
+    fn update(&mut self, chunk: &[u8]) {
+        self.v1 = round(self.v1, LE::read_u64(&chunk[0..8]));
+        self.v2 = round(self.v2, LE::read_u64(&chunk[8..16]));
+        self.v3 = round(self.v3, LE::read_u64(&chunk[16..24]));
+        self.v4 = round(self.v4, LE::read_u64(&chunk[24..32]));
+    }
+}
+
+impl Default for XxHash64 {
+    fn default() -> Self {
+        Self::with_seed(DEFAULT_SEED)
+    }
+}
+
+impl Hasher for XxHash64 {
+    fn finish(&self) -> u64 {
+        self.finish64()
+    }
+
+    fn write(&mut self, bytes: &[u8]) {
+        self.total_len = self.total_len.wrapping_add(bytes.len() as u64);
+
+        if self.buffer_len + bytes.len() < 32 {
+            self.buffer[self.buffer_len..self.buffer_len + bytes.len()].copy_from_slice(bytes);
+            self.buffer_len += bytes.len();
+            return;
+        }
+
+        let mut bytes = bytes;
+
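+        // Top up the partially filled internal buffer first and flush it as one full chunk.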
+        if self.buffer_len != 0 {
+            let needed = 32 - self.buffer_len;
+            self.buffer[self.buffer_len..].copy_from_slice(&bytes[..needed]);
+            let chunk = self.buffer;
+            self.update(&chunk);
+            self.buffer_len = 0;
+            bytes = &bytes[needed..];
+        }
+
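+        // Consume as many full 32-byte chunks as possible directly from the input;
+        // whatever is left over is stashed in the buffer for the next write or finish.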
+        let mut chunks = bytes.chunks_exact(32);
+        for chunk in &mut chunks {
+            self.update(chunk);
+        }
+
+        let remainder = chunks.remainder();
+        if !remainder.is_empty() {
+            self.buffer[..remainder.len()].copy_from_slice(remainder);
+            self.buffer_len = remainder.len();
+        }
+    }
+}
+
+#[inline]
+fn round(mut acc: u64, input: u64) -> u64 {
+    acc = acc.wrapping_add(input.wrapping_mul(P2));
+    acc = acc.rotate_left(31);
+    acc.wrapping_mul(P1)
+}
+
+#[inline]
+fn merge_round(mut acc: u64, val: u64) -> u64 {
+    let mut v = val;
+    v = v.wrapping_mul(P2);
+    v = v.rotate_left(31);
+    v = v.wrapping_mul(P1);
+    acc ^= v;
+    acc.wrapping_mul(P1).wrapping_add(P4)
+}
+
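+// Final avalanche mix that scrambles the accumulated hash before it is returned.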
+#[inline]
+fn finalize(mut hash: u64) -> u64 {
+    hash ^= hash >> 33;
+    hash = hash.wrapping_mul(P2);
+    hash ^= hash >> 29;
+    hash = hash.wrapping_mul(P3);
+    hash ^ (hash >> 32)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const PRIME32: u64 = 0x9E3779B1;
+    const PRIME64: u64 = 0x9E3779B185EBCA8D;
+
+    fn fill_test_buffer(len: usize) -> Vec<u8> {
+        let mut buffer = vec![0u8; len];
+        let mut byte_gen = PRIME32;
+        for byte in &mut buffer {
+            *byte = (byte_gen >> 56) as u8;
+            byte_gen = byte_gen.wrapping_mul(PRIME64);
+        }
+        buffer
+    }
+
+    fn xxhash64(data: &[u8], seed: u64) -> u64 {
+        let mut hasher = XxHash64::with_seed(seed);
+        hasher.write(data);
+        hasher.finish64()
+    }
+
+    #[test]
+    fn test_vectors_seed_zero() {
+        let buf = fill_test_buffer(101);
+        assert_eq!(xxhash64(&buf[..0], 0), 0xEF46DB3751D8E999);
+        assert_eq!(xxhash64(&buf[..1], 0), 0xE934A84ADB052768);
+        assert_eq!(xxhash64(&buf[..32], 0), 0x18B216492BB44B70);
+        assert_eq!(xxhash64(&buf[..33], 0), 0x55C8DC3E578F5B59);
+        assert_eq!(xxhash64(&buf[..100], 0), 0x4BFE019CD91D9EA4);
+    }
+
+    #[test]
+    fn test_vectors_seed_prime32() {
+        let buf = fill_test_buffer(101);
+        assert_eq!(xxhash64(&buf[..0], PRIME32), 0xAC75FDA2929B17EF);
+        assert_eq!(xxhash64(&buf[..1], PRIME32), 0x5014607643A9B4C3);
+        assert_eq!(xxhash64(&buf[..32], PRIME32), 0xB3F33BDF93ADE409);
+        assert_eq!(xxhash64(&buf[..100], PRIME32), 0x4853706DC9625CAE);
+    }
+
+    #[test]
+    fn test_long_check() {
+        let seed = 0;
+        let hash1 = XxHash64::hash_u64(123, seed);
+        let mut hasher = XxHash64::with_seed(seed);
+        hasher.write(&123u64.to_le_bytes());
+        let hash2 = hasher.finish64();
+        assert_eq!(hash2, hash1);
+    }
+}
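
For anyone trying out the new hasher, here is a minimal usage sketch through the
standard std::hash::Hasher trait. The datasketches::hash::XxHash64 import path is an
assumption based on the re-export added to mod.rs above, and the byte strings and
seed are purely illustrative.

    use std::hash::Hasher;

    use datasketches::hash::XxHash64; // assumed public path for the new re-export

    fn main() {
        // One-shot hashing of a byte slice with the default (zero) seed.
        let mut hasher = XxHash64::default();
        hasher.write(b"hello xxhash64");
        let digest = hasher.finish();

        // Feeding the same bytes in pieces exercises the 32-byte buffering in
        // write() and yields the same digest.
        let mut streaming = XxHash64::with_seed(0);
        streaming.write(b"hello ");
        streaming.write(b"xxhash64");
        assert_eq!(streaming.finish(), digest);

        println!("digest = {digest:#018x}");
    }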


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
