This is an automated email from the ASF dual-hosted git repository.
frossi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 434fd57 fix(bloom): align builder parameters with java implementation
(#93)
434fd57 is described below
commit 434fd5713db405d6eb2a6036df6a4931b115193e
Author: Filippo <[email protected]>
AuthorDate: Sat Feb 14 11:26:59 2026 +0100
fix(bloom): align builder parameters with java implementation (#93)
* fix(bloom): align MAX_NUM_BITS with java implementation
* fix(bloom): align builder parameters to java limits
* fix(bloom): make constants public as member of the builder
---
datasketches/src/bloom/builder.rs | 78 ++++++++++++++++++++--------------
datasketches/src/bloom/mod.rs | 2 +-
datasketches/src/bloom/sketch.rs | 88 +++++++++++++++++++++------------------
3 files changed, 96 insertions(+), 72 deletions(-)
diff --git a/datasketches/src/bloom/builder.rs
b/datasketches/src/bloom/builder.rs
index 3a66e26..1918a13 100644
--- a/datasketches/src/bloom/builder.rs
+++ b/datasketches/src/bloom/builder.rs
@@ -16,17 +16,15 @@
// under the License.
use super::BloomFilter;
+use crate::codec::family::Family;
use crate::hash::DEFAULT_UPDATE_SEED;
-const MIN_NUM_BITS: u64 = 64;
-const MAX_NUM_BITS: u64 = (1u64 << 35) - 64; // ~32 GB - reasonable limit
-
/// Builder for creating [`BloomFilter`] instances.
///
/// Provides two construction modes:
/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false
positive rate
/// (recommended)
-/// - [`with_size()`](Self::with_size): Specify exact bit count and hash
functions (manual)
+/// - [`with_size()`](Self::with_size): Specify requested bit count and hash
functions (manual)
#[derive(Debug, Clone)]
pub struct BloomFilterBuilder {
num_bits: u64,
@@ -35,6 +33,18 @@ pub struct BloomFilterBuilder {
}
impl BloomFilterBuilder {
+ /// Minimum allowed requested Bloom filter size, in bits.
+ pub const MIN_NUM_BITS: u64 = 1;
+ /// Maximum allowed requested Bloom filter size, in bits.
+ ///
+ /// Derived from serialization limits so the encoded sketch length fits in
a signed 32-bit size
+ /// field.
+ pub const MAX_NUM_BITS: u64 = (i32::MAX as u64 -
Family::BLOOMFILTER.max_pre_longs as u64) * 64;
+ /// Minimum allowed number of hash functions.
+ pub const MIN_NUM_HASHES: u16 = 1;
+ /// Maximum allowed number of hash functions.
+ pub const MAX_NUM_HASHES: u16 = i16::MAX as u16;
+
/// Creates a builder with optimal parameters for a target accuracy.
///
/// Automatically calculates the optimal number of bits and hash functions
@@ -47,7 +57,7 @@ impl BloomFilterBuilder {
///
/// # Panics
///
- /// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0).
+ /// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0].
///
/// # Examples
///
@@ -61,8 +71,8 @@ impl BloomFilterBuilder {
pub fn with_accuracy(max_items: u64, fpp: f64) -> Self {
assert!(max_items > 0, "max_items must be greater than 0");
assert!(
- fpp > 0.0 && fpp < 1.0,
- "fpp must be between 0.0 and 1.0 (exclusive)"
+ fpp > 0.0 && fpp <= 1.0,
+ "fpp must be between 0.0 and 1.0 (inclusive of 1.0)"
);
let num_bits = Self::suggest_num_bits(max_items, fpp);
@@ -77,9 +87,12 @@ impl BloomFilterBuilder {
/// Creates a builder with manual size specification.
///
- /// Use this when you want precise control over the filter size,
+ /// Use this when you want precise control over the requested filter size,
/// or when working with pre-calculated parameters.
///
+ /// The underlying storage is word-based, so the actual capacity is rounded
+ /// up to the next multiple of 64 bits.
+ ///
/// # Arguments
///
/// - `num_bits`: Total number of bits in the filter
@@ -88,8 +101,8 @@ impl BloomFilterBuilder {
/// # Panics
///
/// Panics if any of:
- /// - `num_bits` < MIN_NUM_BITS (64) or `num_bits` > MAX_NUM_BITS (~32 GB)
- /// - `num_hashes` < 1 or `num_hashes` > 100
+ /// - `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` >
[`Self::MAX_NUM_BITS`]
+ /// - `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` >
[`Self::MIN_NUM_HASHES`]
///
/// # Examples
///
@@ -99,17 +112,19 @@ impl BloomFilterBuilder {
/// ```
pub fn with_size(num_bits: u64, num_hashes: u16) -> Self {
assert!(
- num_bits >= MIN_NUM_BITS,
- "num_bits must be at least {}",
- MIN_NUM_BITS
+ (Self::MIN_NUM_BITS..=Self::MAX_NUM_BITS).contains(&num_bits),
+ "num_bits must be between {} and {}, got {}",
+ Self::MIN_NUM_BITS,
+ Self::MAX_NUM_BITS,
+ num_bits,
);
assert!(
- num_bits <= MAX_NUM_BITS,
- "num_bits must not exceed {}",
- MAX_NUM_BITS
+
(Self::MIN_NUM_HASHES..=Self::MAX_NUM_HASHES).contains(&num_hashes),
+ "num_bits must be between {} and {}, got {}",
+ Self::MIN_NUM_HASHES,
+ Self::MAX_NUM_HASHES,
+ num_hashes
);
- assert!(num_hashes >= 1, "num_hashes must be at least 1");
- assert!(num_hashes <= 100, "num_hashes must not exceed 100");
BloomFilterBuilder {
num_bits,
@@ -141,16 +156,13 @@ impl BloomFilterBuilder {
///
/// Panics if neither `with_accuracy()` nor `with_size()` was called.
pub fn build(self) -> BloomFilter {
- let capacity_bits = self.num_bits;
let num_hashes = self.num_hashes;
-
- let num_words = capacity_bits.div_ceil(64) as usize;
- let bit_array = vec![0u64; num_words];
+ let num_words = self.num_bits.div_ceil(64) as usize;
+ let bit_array = vec![0u64; num_words].into_boxed_slice();
BloomFilter {
seed: self.seed,
num_hashes,
- capacity_bits,
num_bits_set: 0,
bit_array,
}
@@ -175,10 +187,7 @@ impl BloomFilterBuilder {
let bits = (-n * p.ln() / ln2_squared).ceil() as u64;
- // Round up to multiple of 64 for efficiency
- let bits = bits.div_ceil(64) * 64;
-
- bits.clamp(MIN_NUM_BITS, MAX_NUM_BITS)
+ bits.clamp(Self::MIN_NUM_BITS, Self::MAX_NUM_BITS)
}
/// Suggests optimal number of hash functions given max items and bit
count.
@@ -197,9 +206,12 @@ impl BloomFilterBuilder {
let m = num_bits as f64;
let n = max_items as f64;
- let k = (m / n * std::f64::consts::LN_2).round();
-
- (k as u16).clamp(1, 100) // Reasonable bounds
+ // Ceil to avoid selecting too few hashes.
+ let k = (m / n * std::f64::consts::LN_2).ceil();
+ k.clamp(
+ f64::from(Self::MIN_NUM_HASHES),
+ f64::from(Self::MAX_NUM_HASHES),
+ ) as u16
}
/// Suggests optimal number of hash functions from target FPP.
@@ -215,7 +227,11 @@ impl BloomFilterBuilder {
/// assert_eq!(hashes, 7); // -log2(0.01) ≈ 6.64
/// ```
pub fn suggest_num_hashes_from_fpp(fpp: f64) -> u16 {
+ // Ceil to avoid selecting too few hashes.
let k = -fpp.log2();
- (k.round() as u16).clamp(1, 100)
+ k.ceil().clamp(
+ f64::from(Self::MIN_NUM_HASHES),
+ f64::from(Self::MAX_NUM_HASHES),
+ ) as u16
}
}
diff --git a/datasketches/src/bloom/mod.rs b/datasketches/src/bloom/mod.rs
index 4a91287..e5ac69e 100644
--- a/datasketches/src/bloom/mod.rs
+++ b/datasketches/src/bloom/mod.rs
@@ -72,7 +72,7 @@
//!
//! ## By Size (Manual)
//!
-//! Specify exact bit count and hash functions:
+//! Specify requested bit count and hash functions (rounded up to a multiple
of 64 bits):
//!
//! ```rust
//! # use datasketches::bloom::BloomFilterBuilder;
diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs
index 638259f..1f20709 100644
--- a/datasketches/src/bloom/sketch.rs
+++ b/datasketches/src/bloom/sketch.rs
@@ -42,13 +42,10 @@ pub struct BloomFilter {
pub(super) seed: u64,
/// Number of hash functions to use (k)
pub(super) num_hashes: u16,
- /// Total number of bits in the filter (m)
- pub(super) capacity_bits: u64,
/// Count of bits set to 1 (for statistics)
pub(super) num_bits_set: u64,
/// Bit array packed into u64 words
- /// Length = ceil(capacity_bits / 64)
- pub(super) bit_array: Vec<u64>,
+ pub(super) bit_array: Box<[u64]>,
}
impl BloomFilter {
@@ -246,26 +243,10 @@ impl BloomFilter {
/// // Now "apple" probably returns false, and most other items return true
/// ```
pub fn invert(&mut self) {
- // Invert bits and count during operation (single pass)
- let mut num_bits_set = 0;
for word in &mut self.bit_array {
*word = !*word;
- num_bits_set += word.count_ones() as u64;
- }
-
- // Mask off excess bits in the last word
- let excess_bits = self.capacity_bits % 64;
- if excess_bits != 0 {
- let last_idx = self.bit_array.len() - 1;
- let old_count = self.bit_array[last_idx].count_ones() as u64;
- let mask = (1u64 << excess_bits) - 1;
- self.bit_array[last_idx] &= mask;
- let new_count = self.bit_array[last_idx].count_ones() as u64;
- // Adjust count for masked-off bits
- num_bits_set = num_bits_set - old_count + new_count;
}
-
- self.num_bits_set = num_bits_set;
+ self.num_bits_set = self.capacity() as u64 - self.num_bits_set;
}
/// Returns whether the filter is empty (no items inserted).
@@ -281,8 +262,8 @@ impl BloomFilter {
}
/// Returns the total number of bits in the filter (capacity).
- pub fn capacity(&self) -> u64 {
- self.capacity_bits
+ pub fn capacity(&self) -> usize {
+ self.bit_array.len() * 64
}
/// Returns the number of hash functions used.
@@ -300,7 +281,7 @@ impl BloomFilter {
/// Values near 0.5 indicate the filter is approaching saturation.
/// Values above 0.5 indicate degraded false positive rates.
pub fn load_factor(&self) -> f64 {
- self.num_bits_set as f64 / self.capacity_bits as f64
+ self.num_bits_set as f64 / self.capacity() as f64
}
/// Estimates the current false positive probability.
@@ -327,8 +308,8 @@ impl BloomFilter {
/// - Capacity (number of bits)
/// - Number of hash functions
/// - Seed
- pub fn is_compatible(&self, other: &BloomFilter) -> bool {
- self.capacity_bits == other.capacity_bits
+ pub fn is_compatible(&self, other: &Self) -> bool {
+ self.bit_array.len() == other.bit_array.len()
&& self.num_hashes == other.num_hashes
&& self.seed == other.seed
}
@@ -374,8 +355,8 @@ impl BloomFilter {
bytes.write_u64_le(self.seed);
- // Bit array capacity stored as number of 64-bit words (int32) +
unused padding (uint32)
- let num_longs = (self.capacity_bits / 64) as i32;
+ // Bit array capacity is stored as number of 64-bit words (int32) +
unused padding (uint32).
+ let num_longs = self.bit_array.len() as i32;
bytes.write_i32_le(num_longs);
bytes.write_u32_le(0); // unused
@@ -454,6 +435,13 @@ impl BloomFilter {
let num_hashes = cursor
.read_u16_le()
.map_err(|_| Error::insufficient_data("num_hashes"))?;
+ if num_hashes == 0 || num_hashes > i16::MAX as u16 {
+ return Err(Error::deserial(format!(
+ "invalid num_hashes: expected [1, {}], got {}",
+ i16::MAX,
+ num_hashes
+ )));
+ }
// Bytes 6-7: unused (u16)
let _unused = cursor
.read_u16_le()
@@ -462,17 +450,23 @@ impl BloomFilter {
.read_u64_le()
.map_err(|_| Error::insufficient_data("seed"))?;
- // Bit array capacity stored as number of 64-bit words (int32) +
unused padding (uint32)
+ // Bit array capacity is stored as number of 64-bit words (int32) +
unused padding (uint32).
let num_longs = cursor
.read_i32_le()
- .map_err(|_| Error::insufficient_data("num_longs"))? as u64;
+ .map_err(|_| Error::insufficient_data("num_longs"))?;
let _unused = cursor
.read_u32_le()
.map_err(|_| Error::insufficient_data("unused"))?;
- let capacity_bits = num_longs * 64; // Convert longs to bits
+ if num_longs <= 0 {
+ return Err(Error::deserial(format!(
+ "invalid num_longs: expected at least 1, got {}",
+ num_longs
+ )));
+ }
+
let num_words = num_longs as usize;
- let mut bit_array = vec![0u64; num_words];
+ let mut bit_array = vec![0u64; num_words].into_boxed_slice();
let num_bits_set;
if is_empty {
@@ -493,6 +487,14 @@ impl BloomFilter {
if raw_num_bits_set == DIRTY_BITS_VALUE {
num_bits_set = bit_array.iter().map(|w| w.count_ones() as
u64).sum();
} else {
+ let raw_num_words_set = raw_num_bits_set.div_ceil(64) as usize;
+ if raw_num_words_set > num_words {
+ return Err(Error::deserial(format!(
+ "invalid num_bits_set: expected <= {}, got {}",
+ num_words * 64,
+ raw_num_bits_set
+ )));
+ }
num_bits_set = raw_num_bits_set;
}
}
@@ -500,7 +502,6 @@ impl BloomFilter {
Ok(BloomFilter {
seed,
num_hashes,
- capacity_bits,
num_bits_set,
bit_array,
})
@@ -552,22 +553,22 @@ impl BloomFilter {
/// ```
///
/// The right shift by 1 improves bit distribution. The index `i` is
1-based.
- fn compute_bit_index(&self, h0: u64, h1: u64, i: u16) -> u64 {
- let hash = h0.wrapping_add(u64::from(i).wrapping_mul(h1));
- (hash >> 1) % self.capacity_bits
+ fn compute_bit_index(&self, h0: u64, h1: u64, i: u16) -> usize {
+ let hash = h0.wrapping_add(u64::from(i).wrapping_mul(h1)) as usize;
+ (hash >> 1) % self.capacity()
}
/// Gets the value of a single bit.
- fn get_bit(&self, bit_index: u64) -> bool {
- let word_index = (bit_index >> 6) as usize; // Equivalent to bit_index
/ 64
+ fn get_bit(&self, bit_index: usize) -> bool {
+ let word_index = bit_index >> 6; // Equivalent to bit_index / 64
let bit_offset = bit_index & 63; // Equivalent to bit_index % 64
let mask = 1u64 << bit_offset;
(self.bit_array[word_index] & mask) != 0
}
/// Sets a single bit and updates the count if it wasn't already set.
- fn set_bit(&mut self, bit_index: u64) {
- let word_index = (bit_index >> 6) as usize; // Equivalent to bit_index
/ 64
+ fn set_bit(&mut self, bit_index: usize) {
+ let word_index = bit_index >> 6; // Equivalent to bit_index / 64
let bit_offset = bit_index & 63; // Equivalent to bit_index % 64
let mask = 1u64 << bit_offset;
@@ -598,6 +599,13 @@ mod tests {
assert_eq!(filter.num_hashes(), 5);
}
+ #[test]
+ fn test_builder_with_size_rounds_to_word_boundary() {
+ let filter = BloomFilterBuilder::with_size(1, 3).build();
+ assert_eq!(filter.capacity(), 64);
+ assert_eq!(filter.num_hashes(), 3);
+ }
+
#[test]
fn test_insert_and_contains() {
let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]