This is an automated email from the ASF dual-hosted git repository.

tison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git


The following commit(s) were added to refs/heads/main by this push:
     new 9a83070  chore: check seed for CpcSketch and CountMinSketch (#85)
9a83070 is described below

commit 9a830707410a66823e997b04cfd6e48890904957
Author: tison <[email protected]>
AuthorDate: Wed Feb 11 09:31:10 2026 +0800

    chore: check seed for CpcSketch and CountMinSketch (#85)
    
    Signed-off-by: tison <[email protected]>
---
 datasketches/src/bloom/builder.rs   |  2 +-
 datasketches/src/countmin/sketch.rs | 13 ++++++++++---
 datasketches/src/cpc/sketch.rs      |  9 ++++++---
 datasketches/src/hash/mod.rs        | 12 +++++++++---
 4 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/datasketches/src/bloom/builder.rs b/datasketches/src/bloom/builder.rs
index 13e3b5a..3a66e26 100644
--- a/datasketches/src/bloom/builder.rs
+++ b/datasketches/src/bloom/builder.rs
@@ -87,7 +87,7 @@ impl BloomFilterBuilder {
     ///
     /// # Panics
     ///
-    /// Panics if:
+    /// Panics if any of:
     /// - `num_bits` < MIN_NUM_BITS (64) or `num_bits` > MAX_NUM_BITS (~32 GB)
     /// - `num_hashes` < 1 or `num_hashes` > 100
     ///
diff --git a/datasketches/src/countmin/sketch.rs b/datasketches/src/countmin/sketch.rs
index 4727699..ad014a1 100644
--- a/datasketches/src/countmin/sketch.rs
+++ b/datasketches/src/countmin/sketch.rs
@@ -43,6 +43,7 @@ pub struct CountMinSketch<T: CountMinValue> {
     num_hashes: u8,
     num_buckets: u32,
     seed: u64,
+    seed_hash: u16,
     total_weight: T,
     counts: Vec<T>,
     hash_seeds: Vec<u64>,
@@ -71,8 +72,11 @@ impl<T: CountMinValue> CountMinSketch<T> {
     ///
     /// # Panics
     ///
-    /// Panics if `num_hashes` is 0, `num_buckets` is less than 3, or the
-    /// total table size exceeds the supported limit.
+    /// Panics if any of:
+    /// - `num_hashes` is 0
+    /// - `num_buckets` is less than 3
+    /// - the total table size exceeds the supported limit
+    /// - the computed seed hash is zero
     ///
     /// # Examples
     ///
@@ -281,7 +285,8 @@ impl<T: CountMinValue> CountMinSketch<T> {
 
         bytes.write_u32_le(self.num_buckets);
         bytes.write_u8(self.num_hashes);
-        bytes.write_u16_le(compute_seed_hash(self.seed));
+        debug_assert_eq!(self.seed_hash, compute_seed_hash(self.seed));
+        bytes.write_u16_le(self.seed_hash);
         bytes.write_u8(0);
 
         if self.is_empty() {
@@ -391,11 +396,13 @@ impl<T: CountMinValue> CountMinSketch<T> {
 
     fn make(num_hashes: u8, num_buckets: u32, seed: u64, entries: usize) -> Self {
         let counts = vec![T::ZERO; entries];
+        let seed_hash = compute_seed_hash(seed);
         let hash_seeds = make_hash_seeds(seed, num_hashes);
         CountMinSketch {
             num_hashes,
             num_buckets,
             seed,
+            seed_hash,
             total_weight: T::ZERO,
             counts,
             hash_seeds,
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index a1b771b..233c486 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -51,6 +51,7 @@ pub struct CpcSketch {
     // immutable config variables
     lg_k: u8,
     seed: u64,
+    seed_hash: u16,
 
     // sketch state
     /// Part of a speed optimization.
@@ -97,7 +98,7 @@ impl CpcSketch {
     ///
     /// # Panics
     ///
-    /// Panics if `lg_k` is not in the range `[4, 16]`.
+    /// Panics if `lg_k` is not in the range `[4, 16]`, or the computed seed hash is zero.
     pub fn with_seed(lg_k: u8, seed: u64) -> Self {
         assert!(
             (MIN_LG_K..=MAX_LG_K).contains(&lg_k),
@@ -107,6 +108,7 @@ impl CpcSketch {
         Self {
             lg_k,
             seed,
+            seed_hash: compute_seed_hash(seed),
             first_interesting_column: 0,
             num_coupons: 0,
             surprising_value_table: None,
@@ -461,8 +463,8 @@ impl CpcSketch {
             | (if has_table { 1 } else { 0 } << FLAG_HAS_TABLE)
             | (if has_window { 1 } else { 0 } << FLAG_HAS_WINDOW);
         bytes.write_u8(flags);
-        let seed_hash = compute_seed_hash(self.seed);
-        bytes.write_u16_le(seed_hash);
+        debug_assert_eq!(self.seed_hash, compute_seed_hash(self.seed));
+        bytes.write_u16_le(self.seed_hash);
         if !self.is_empty() {
             bytes.write_u32_le(self.num_coupons);
             if has_table && has_window {
@@ -627,6 +629,7 @@ impl CpcSketch {
         Ok(CpcSketch {
             lg_k,
             seed,
+            seed_hash,
             first_interesting_column,
             num_coupons,
             surprising_value_table: Some(uncompressed.table),
diff --git a/datasketches/src/hash/mod.rs b/datasketches/src/hash/mod.rs
index 492e45e..87eaf22 100644
--- a/datasketches/src/hash/mod.rs
+++ b/datasketches/src/hash/mod.rs
@@ -39,15 +39,21 @@ pub(crate) const DEFAULT_UPDATE_SEED: u64 = 9001;
 
 /// Computes and checks the 16-bit seed hash from the given long seed.
 ///
-/// The seed hash may not be zero in order to maintain compatibility with older serialized
-/// versions that did not have this concept.
+/// The computed seed hash must not be zero in order to maintain compatibility with older
+/// serialized versions that did not have this concept.
+///
+/// # Panics
+///
+/// Panics if the computed seed hash is zero.
 pub(crate) fn compute_seed_hash(seed: u64) -> u16 {
     use std::hash::Hasher;
 
     let mut hasher = MurmurHash3X64128::with_seed(0);
     hasher.write(&seed.to_le_bytes());
     let (h1, _) = hasher.finish128();
-    (h1 & 0xffff) as u16
+    let seed_hash = (h1 & 0xffff) as u16;
+    assert_ne!(seed_hash, 0);
+    seed_hash
 }
 
 /// Reads an u64 from a byte slice in little-endian order.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to