This is an automated email from the ASF dual-hosted git repository.

frossi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git


The following commit(s) were added to refs/heads/main by this push:
     new 434fd57  fix(bloom): align builder parameters with java implementation 
(#93)
434fd57 is described below

commit 434fd5713db405d6eb2a6036df6a4931b115193e
Author: Filippo <[email protected]>
AuthorDate: Sat Feb 14 11:26:59 2026 +0100

    fix(bloom): align builder parameters with java implementation (#93)
    
    * fix(bloom): align MAX_NUM_BITS with java implementation
    * fix(bloom): align builder parameters to java limits
    * fix(bloom): make constants public as member of the builder
---
 datasketches/src/bloom/builder.rs | 78 ++++++++++++++++++++--------------
 datasketches/src/bloom/mod.rs     |  2 +-
 datasketches/src/bloom/sketch.rs  | 88 +++++++++++++++++++++------------------
 3 files changed, 96 insertions(+), 72 deletions(-)

diff --git a/datasketches/src/bloom/builder.rs 
b/datasketches/src/bloom/builder.rs
index 3a66e26..1918a13 100644
--- a/datasketches/src/bloom/builder.rs
+++ b/datasketches/src/bloom/builder.rs
@@ -16,17 +16,15 @@
 // under the License.
 
 use super::BloomFilter;
+use crate::codec::family::Family;
 use crate::hash::DEFAULT_UPDATE_SEED;
 
-const MIN_NUM_BITS: u64 = 64;
-const MAX_NUM_BITS: u64 = (1u64 << 35) - 64; // ~32 GB - reasonable limit
-
 /// Builder for creating [`BloomFilter`] instances.
 ///
 /// Provides two construction modes:
 /// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false 
positive rate
 ///   (recommended)
-/// - [`with_size()`](Self::with_size): Specify exact bit count and hash 
functions (manual)
+/// - [`with_size()`](Self::with_size): Specify requested bit count and hash 
functions (manual)
 #[derive(Debug, Clone)]
 pub struct BloomFilterBuilder {
     num_bits: u64,
@@ -35,6 +33,18 @@ pub struct BloomFilterBuilder {
 }
 
 impl BloomFilterBuilder {
+    /// Minimum allowed requested Bloom filter size, in bits.
+    pub const MIN_NUM_BITS: u64 = 1;
+    /// Maximum allowed requested Bloom filter size, in bits.
+    ///
+    /// Derived from serialization limits so the encoded sketch length fits in 
a signed 32-bit size
+    /// field.
+    pub const MAX_NUM_BITS: u64 = (i32::MAX as u64 - 
Family::BLOOMFILTER.max_pre_longs as u64) * 64;
+    /// Minimum allowed number of hash functions.
+    pub const MIN_NUM_HASHES: u16 = 1;
+    /// Maximum allowed number of hash functions.
+    pub const MAX_NUM_HASHES: u16 = i16::MAX as u16;
+
     /// Creates a builder with optimal parameters for a target accuracy.
     ///
     /// Automatically calculates the optimal number of bits and hash functions
@@ -47,7 +57,7 @@ impl BloomFilterBuilder {
     ///
     /// # Panics
     ///
-    /// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0).
+    /// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0].
     ///
     /// # Examples
     ///
@@ -61,8 +71,8 @@ impl BloomFilterBuilder {
     pub fn with_accuracy(max_items: u64, fpp: f64) -> Self {
         assert!(max_items > 0, "max_items must be greater than 0");
         assert!(
-            fpp > 0.0 && fpp < 1.0,
-            "fpp must be between 0.0 and 1.0 (exclusive)"
+            fpp > 0.0 && fpp <= 1.0,
+            "fpp must be between 0.0 and 1.0 (inclusive of 1.0)"
         );
 
         let num_bits = Self::suggest_num_bits(max_items, fpp);
@@ -77,9 +87,12 @@ impl BloomFilterBuilder {
 
     /// Creates a builder with manual size specification.
     ///
-    /// Use this when you want precise control over the filter size,
+    /// Use this when you want precise control over the requested filter size,
     /// or when working with pre-calculated parameters.
     ///
+    /// The underlying storage is word-based, so the actual capacity is rounded
+    /// up to the next multiple of 64 bits.
+    ///
     /// # Arguments
     ///
     /// - `num_bits`: Total number of bits in the filter
@@ -88,8 +101,8 @@ impl BloomFilterBuilder {
     /// # Panics
     ///
     /// Panics if any of:
-    /// - `num_bits` < MIN_NUM_BITS (64) or `num_bits` > MAX_NUM_BITS (~32 GB)
-    /// - `num_hashes` < 1 or `num_hashes` > 100
+    /// - `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` > 
[`Self::MAX_NUM_BITS`]
+    /// - `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` > 
[`Self::MIN_NUM_HASHES`]
     ///
     /// # Examples
     ///
@@ -99,17 +112,19 @@ impl BloomFilterBuilder {
     /// ```
     pub fn with_size(num_bits: u64, num_hashes: u16) -> Self {
         assert!(
-            num_bits >= MIN_NUM_BITS,
-            "num_bits must be at least {}",
-            MIN_NUM_BITS
+            (Self::MIN_NUM_BITS..=Self::MAX_NUM_BITS).contains(&num_bits),
+            "num_bits must be between {} and {}, got {}",
+            Self::MIN_NUM_BITS,
+            Self::MAX_NUM_BITS,
+            num_bits,
         );
         assert!(
-            num_bits <= MAX_NUM_BITS,
-            "num_bits must not exceed {}",
-            MAX_NUM_BITS
+            
(Self::MIN_NUM_HASHES..=Self::MAX_NUM_HASHES).contains(&num_hashes),
+            "num_bits must be between {} and {}, got {}",
+            Self::MIN_NUM_HASHES,
+            Self::MAX_NUM_HASHES,
+            num_hashes
         );
-        assert!(num_hashes >= 1, "num_hashes must be at least 1");
-        assert!(num_hashes <= 100, "num_hashes must not exceed 100");
 
         BloomFilterBuilder {
             num_bits,
@@ -141,16 +156,13 @@ impl BloomFilterBuilder {
     ///
     /// Panics if neither `with_accuracy()` nor `with_size()` was called.
     pub fn build(self) -> BloomFilter {
-        let capacity_bits = self.num_bits;
         let num_hashes = self.num_hashes;
-
-        let num_words = capacity_bits.div_ceil(64) as usize;
-        let bit_array = vec![0u64; num_words];
+        let num_words = self.num_bits.div_ceil(64) as usize;
+        let bit_array = vec![0u64; num_words].into_boxed_slice();
 
         BloomFilter {
             seed: self.seed,
             num_hashes,
-            capacity_bits,
             num_bits_set: 0,
             bit_array,
         }
@@ -175,10 +187,7 @@ impl BloomFilterBuilder {
 
         let bits = (-n * p.ln() / ln2_squared).ceil() as u64;
 
-        // Round up to multiple of 64 for efficiency
-        let bits = bits.div_ceil(64) * 64;
-
-        bits.clamp(MIN_NUM_BITS, MAX_NUM_BITS)
+        bits.clamp(Self::MIN_NUM_BITS, Self::MAX_NUM_BITS)
     }
 
     /// Suggests optimal number of hash functions given max items and bit 
count.
@@ -197,9 +206,12 @@ impl BloomFilterBuilder {
         let m = num_bits as f64;
         let n = max_items as f64;
 
-        let k = (m / n * std::f64::consts::LN_2).round();
-
-        (k as u16).clamp(1, 100) // Reasonable bounds
+        // Ceil to avoid selecting too few hashes.
+        let k = (m / n * std::f64::consts::LN_2).ceil();
+        k.clamp(
+            f64::from(Self::MIN_NUM_HASHES),
+            f64::from(Self::MAX_NUM_HASHES),
+        ) as u16
     }
 
     /// Suggests optimal number of hash functions from target FPP.
@@ -215,7 +227,11 @@ impl BloomFilterBuilder {
     /// assert_eq!(hashes, 7); // -log2(0.01) ≈ 6.64
     /// ```
     pub fn suggest_num_hashes_from_fpp(fpp: f64) -> u16 {
+        // Ceil to avoid selecting too few hashes.
         let k = -fpp.log2();
-        (k.round() as u16).clamp(1, 100)
+        k.ceil().clamp(
+            f64::from(Self::MIN_NUM_HASHES),
+            f64::from(Self::MAX_NUM_HASHES),
+        ) as u16
     }
 }
diff --git a/datasketches/src/bloom/mod.rs b/datasketches/src/bloom/mod.rs
index 4a91287..e5ac69e 100644
--- a/datasketches/src/bloom/mod.rs
+++ b/datasketches/src/bloom/mod.rs
@@ -72,7 +72,7 @@
 //!
 //! ## By Size (Manual)
 //!
-//! Specify exact bit count and hash functions:
+//! Specify requested bit count and hash functions (rounded up to a multiple 
of 64 bits):
 //!
 //! ```rust
 //! # use datasketches::bloom::BloomFilterBuilder;
diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs
index 638259f..1f20709 100644
--- a/datasketches/src/bloom/sketch.rs
+++ b/datasketches/src/bloom/sketch.rs
@@ -42,13 +42,10 @@ pub struct BloomFilter {
     pub(super) seed: u64,
     /// Number of hash functions to use (k)
     pub(super) num_hashes: u16,
-    /// Total number of bits in the filter (m)
-    pub(super) capacity_bits: u64,
     /// Count of bits set to 1 (for statistics)
     pub(super) num_bits_set: u64,
     /// Bit array packed into u64 words
-    /// Length = ceil(capacity_bits / 64)
-    pub(super) bit_array: Vec<u64>,
+    pub(super) bit_array: Box<[u64]>,
 }
 
 impl BloomFilter {
@@ -246,26 +243,10 @@ impl BloomFilter {
     /// // Now "apple" probably returns false, and most other items return true
     /// ```
     pub fn invert(&mut self) {
-        // Invert bits and count during operation (single pass)
-        let mut num_bits_set = 0;
         for word in &mut self.bit_array {
             *word = !*word;
-            num_bits_set += word.count_ones() as u64;
-        }
-
-        // Mask off excess bits in the last word
-        let excess_bits = self.capacity_bits % 64;
-        if excess_bits != 0 {
-            let last_idx = self.bit_array.len() - 1;
-            let old_count = self.bit_array[last_idx].count_ones() as u64;
-            let mask = (1u64 << excess_bits) - 1;
-            self.bit_array[last_idx] &= mask;
-            let new_count = self.bit_array[last_idx].count_ones() as u64;
-            // Adjust count for masked-off bits
-            num_bits_set = num_bits_set - old_count + new_count;
         }
-
-        self.num_bits_set = num_bits_set;
+        self.num_bits_set = self.capacity() as u64 - self.num_bits_set;
     }
 
     /// Returns whether the filter is empty (no items inserted).
@@ -281,8 +262,8 @@ impl BloomFilter {
     }
 
     /// Returns the total number of bits in the filter (capacity).
-    pub fn capacity(&self) -> u64 {
-        self.capacity_bits
+    pub fn capacity(&self) -> usize {
+        self.bit_array.len() * 64
     }
 
     /// Returns the number of hash functions used.
@@ -300,7 +281,7 @@ impl BloomFilter {
     /// Values near 0.5 indicate the filter is approaching saturation.
     /// Values above 0.5 indicate degraded false positive rates.
     pub fn load_factor(&self) -> f64 {
-        self.num_bits_set as f64 / self.capacity_bits as f64
+        self.num_bits_set as f64 / self.capacity() as f64
     }
 
     /// Estimates the current false positive probability.
@@ -327,8 +308,8 @@ impl BloomFilter {
     /// - Capacity (number of bits)
     /// - Number of hash functions
     /// - Seed
-    pub fn is_compatible(&self, other: &BloomFilter) -> bool {
-        self.capacity_bits == other.capacity_bits
+    pub fn is_compatible(&self, other: &Self) -> bool {
+        self.bit_array.len() == other.bit_array.len()
             && self.num_hashes == other.num_hashes
             && self.seed == other.seed
     }
@@ -374,8 +355,8 @@ impl BloomFilter {
 
         bytes.write_u64_le(self.seed);
 
-        // Bit array capacity stored as number of 64-bit words (int32) + 
unused padding (uint32)
-        let num_longs = (self.capacity_bits / 64) as i32;
+        // Bit array capacity is stored as number of 64-bit words (int32) + 
unused padding (uint32).
+        let num_longs = self.bit_array.len() as i32;
         bytes.write_i32_le(num_longs);
         bytes.write_u32_le(0); // unused
 
@@ -454,6 +435,13 @@ impl BloomFilter {
         let num_hashes = cursor
             .read_u16_le()
             .map_err(|_| Error::insufficient_data("num_hashes"))?;
+        if num_hashes == 0 || num_hashes > i16::MAX as u16 {
+            return Err(Error::deserial(format!(
+                "invalid num_hashes: expected [1, {}], got {}",
+                i16::MAX,
+                num_hashes
+            )));
+        }
         // Bytes 6-7: unused (u16)
         let _unused = cursor
             .read_u16_le()
@@ -462,17 +450,23 @@ impl BloomFilter {
             .read_u64_le()
             .map_err(|_| Error::insufficient_data("seed"))?;
 
-        // Bit array capacity stored as number of 64-bit words (int32) + 
unused padding (uint32)
+        // Bit array capacity is stored as number of 64-bit words (int32) + 
unused padding (uint32).
         let num_longs = cursor
             .read_i32_le()
-            .map_err(|_| Error::insufficient_data("num_longs"))? as u64;
+            .map_err(|_| Error::insufficient_data("num_longs"))?;
         let _unused = cursor
             .read_u32_le()
             .map_err(|_| Error::insufficient_data("unused"))?;
 
-        let capacity_bits = num_longs * 64; // Convert longs to bits
+        if num_longs <= 0 {
+            return Err(Error::deserial(format!(
+                "invalid num_longs: expected at least 1, got {}",
+                num_longs
+            )));
+        }
+
         let num_words = num_longs as usize;
-        let mut bit_array = vec![0u64; num_words];
+        let mut bit_array = vec![0u64; num_words].into_boxed_slice();
         let num_bits_set;
 
         if is_empty {
@@ -493,6 +487,14 @@ impl BloomFilter {
             if raw_num_bits_set == DIRTY_BITS_VALUE {
                 num_bits_set = bit_array.iter().map(|w| w.count_ones() as 
u64).sum();
             } else {
+                let raw_num_words_set = raw_num_bits_set.div_ceil(64) as usize;
+                if raw_num_words_set > num_words {
+                    return Err(Error::deserial(format!(
+                        "invalid num_bits_set: expected <= {}, got {}",
+                        num_words * 64,
+                        raw_num_bits_set
+                    )));
+                }
                 num_bits_set = raw_num_bits_set;
             }
         }
@@ -500,7 +502,6 @@ impl BloomFilter {
         Ok(BloomFilter {
             seed,
             num_hashes,
-            capacity_bits,
             num_bits_set,
             bit_array,
         })
@@ -552,22 +553,22 @@ impl BloomFilter {
     /// ```
     ///
     /// The right shift by 1 improves bit distribution. The index `i` is 
1-based.
-    fn compute_bit_index(&self, h0: u64, h1: u64, i: u16) -> u64 {
-        let hash = h0.wrapping_add(u64::from(i).wrapping_mul(h1));
-        (hash >> 1) % self.capacity_bits
+    fn compute_bit_index(&self, h0: u64, h1: u64, i: u16) -> usize {
+        let hash = h0.wrapping_add(u64::from(i).wrapping_mul(h1)) as usize;
+        (hash >> 1) % self.capacity()
     }
 
     /// Gets the value of a single bit.
-    fn get_bit(&self, bit_index: u64) -> bool {
-        let word_index = (bit_index >> 6) as usize; // Equivalent to bit_index 
/ 64
+    fn get_bit(&self, bit_index: usize) -> bool {
+        let word_index = bit_index >> 6; // Equivalent to bit_index / 64
         let bit_offset = bit_index & 63; // Equivalent to bit_index % 64
         let mask = 1u64 << bit_offset;
         (self.bit_array[word_index] & mask) != 0
     }
 
     /// Sets a single bit and updates the count if it wasn't already set.
-    fn set_bit(&mut self, bit_index: u64) {
-        let word_index = (bit_index >> 6) as usize; // Equivalent to bit_index 
/ 64
+    fn set_bit(&mut self, bit_index: usize) {
+        let word_index = bit_index >> 6; // Equivalent to bit_index / 64
         let bit_offset = bit_index & 63; // Equivalent to bit_index % 64
         let mask = 1u64 << bit_offset;
 
@@ -598,6 +599,13 @@ mod tests {
         assert_eq!(filter.num_hashes(), 5);
     }
 
+    #[test]
+    fn test_builder_with_size_rounds_to_word_boundary() {
+        let filter = BloomFilterBuilder::with_size(1, 3).build();
+        assert_eq!(filter.capacity(), 64);
+        assert_eq!(filter.num_hashes(), 3);
+    }
+
     #[test]
     fn test_insert_and_contains() {
         let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to