This is an automated email from the ASF dual-hosted git repository.
tison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 9a83070 chore: check seed for CpcSketch and CountMinSketch (#85)
9a83070 is described below
commit 9a830707410a66823e997b04cfd6e48890904957
Author: tison <[email protected]>
AuthorDate: Wed Feb 11 09:31:10 2026 +0800
chore: check seed for CpcSketch and CountMinSketch (#85)
Signed-off-by: tison <[email protected]>
---
datasketches/src/bloom/builder.rs | 2 +-
datasketches/src/countmin/sketch.rs | 13 ++++++++++---
datasketches/src/cpc/sketch.rs | 9 ++++++---
datasketches/src/hash/mod.rs | 12 +++++++++---
4 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/datasketches/src/bloom/builder.rs b/datasketches/src/bloom/builder.rs
index 13e3b5a..3a66e26 100644
--- a/datasketches/src/bloom/builder.rs
+++ b/datasketches/src/bloom/builder.rs
@@ -87,7 +87,7 @@ impl BloomFilterBuilder {
///
/// # Panics
///
- /// Panics if:
+ /// Panics if any of:
/// - `num_bits` < MIN_NUM_BITS (64) or `num_bits` > MAX_NUM_BITS (~32 GB)
/// - `num_hashes` < 1 or `num_hashes` > 100
///
diff --git a/datasketches/src/countmin/sketch.rs b/datasketches/src/countmin/sketch.rs
index 4727699..ad014a1 100644
--- a/datasketches/src/countmin/sketch.rs
+++ b/datasketches/src/countmin/sketch.rs
@@ -43,6 +43,7 @@ pub struct CountMinSketch<T: CountMinValue> {
num_hashes: u8,
num_buckets: u32,
seed: u64,
+ seed_hash: u16,
total_weight: T,
counts: Vec<T>,
hash_seeds: Vec<u64>,
@@ -71,8 +72,11 @@ impl<T: CountMinValue> CountMinSketch<T> {
///
/// # Panics
///
- /// Panics if `num_hashes` is 0, `num_buckets` is less than 3, or the
- /// total table size exceeds the supported limit.
+ /// Panics if any of:
+ /// - `num_hashes` is 0
+ /// - `num_buckets` is less than 3
+ /// - the total table size exceeds the supported limit
+ /// - the computed seed hash is zero
///
/// # Examples
///
@@ -281,7 +285,8 @@ impl<T: CountMinValue> CountMinSketch<T> {
bytes.write_u32_le(self.num_buckets);
bytes.write_u8(self.num_hashes);
- bytes.write_u16_le(compute_seed_hash(self.seed));
+ debug_assert_eq!(self.seed_hash, compute_seed_hash(self.seed));
+ bytes.write_u16_le(self.seed_hash);
bytes.write_u8(0);
if self.is_empty() {
@@ -391,11 +396,13 @@ impl<T: CountMinValue> CountMinSketch<T> {
fn make(num_hashes: u8, num_buckets: u32, seed: u64, entries: usize) -> Self {
let counts = vec![T::ZERO; entries];
+ let seed_hash = compute_seed_hash(seed);
let hash_seeds = make_hash_seeds(seed, num_hashes);
CountMinSketch {
num_hashes,
num_buckets,
seed,
+ seed_hash,
total_weight: T::ZERO,
counts,
hash_seeds,
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index a1b771b..233c486 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -51,6 +51,7 @@ pub struct CpcSketch {
// immutable config variables
lg_k: u8,
seed: u64,
+ seed_hash: u16,
// sketch state
/// Part of a speed optimization.
@@ -97,7 +98,7 @@ impl CpcSketch {
///
/// # Panics
///
- /// Panics if `lg_k` is not in the range `[4, 16]`.
+ /// Panics if `lg_k` is not in the range `[4, 16]`, or the computed seed hash is zero.
pub fn with_seed(lg_k: u8, seed: u64) -> Self {
assert!(
(MIN_LG_K..=MAX_LG_K).contains(&lg_k),
@@ -107,6 +108,7 @@ impl CpcSketch {
Self {
lg_k,
seed,
+ seed_hash: compute_seed_hash(seed),
first_interesting_column: 0,
num_coupons: 0,
surprising_value_table: None,
@@ -461,8 +463,8 @@ impl CpcSketch {
| (if has_table { 1 } else { 0 } << FLAG_HAS_TABLE)
| (if has_window { 1 } else { 0 } << FLAG_HAS_WINDOW);
bytes.write_u8(flags);
- let seed_hash = compute_seed_hash(self.seed);
- bytes.write_u16_le(seed_hash);
+ debug_assert_eq!(self.seed_hash, compute_seed_hash(self.seed));
+ bytes.write_u16_le(self.seed_hash);
if !self.is_empty() {
bytes.write_u32_le(self.num_coupons);
if has_table && has_window {
@@ -627,6 +629,7 @@ impl CpcSketch {
Ok(CpcSketch {
lg_k,
seed,
+ seed_hash,
first_interesting_column,
num_coupons,
surprising_value_table: Some(uncompressed.table),
diff --git a/datasketches/src/hash/mod.rs b/datasketches/src/hash/mod.rs
index 492e45e..87eaf22 100644
--- a/datasketches/src/hash/mod.rs
+++ b/datasketches/src/hash/mod.rs
@@ -39,15 +39,21 @@ pub(crate) const DEFAULT_UPDATE_SEED: u64 = 9001;
/// Computes and checks the 16-bit seed hash from the given long seed.
///
-/// The seed hash may not be zero in order to maintain compatibility with older serialized
-/// versions that did not have this concept.
+/// The computed seed hash must not be zero in order to maintain compatibility with older
+/// serialized versions that did not have this concept.
+///
+/// # Panics
+///
+/// Panics if the computed seed hash is zero.
pub(crate) fn compute_seed_hash(seed: u64) -> u16 {
use std::hash::Hasher;
let mut hasher = MurmurHash3X64128::with_seed(0);
hasher.write(&seed.to_le_bytes());
let (h1, _) = hasher.finish128();
- (h1 & 0xffff) as u16
+ let seed_hash = (h1 & 0xffff) as u16;
+ assert_ne!(seed_hash, 0);
+ seed_hash
}
/// Reads an u64 from a byte slice in little-endian order.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]