This is an automated email from the ASF dual-hosted git repository.
tison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 6546d66 refactor: export FrequentItemValue and improve docs (#98)
6546d66 is described below
commit 6546d665c629e311b27279c2212a7b92b922fc09
Author: tison <[email protected]>
AuthorDate: Thu Feb 19 19:22:24 2026 +0800
refactor: export FrequentItemValue and improve docs (#98)
Signed-off-by: tison <[email protected]>
---
CHANGELOG.md | 3 +-
datasketches/src/bloom/builder.rs | 16 +--
datasketches/src/bloom/mod.rs | 20 ++--
datasketches/src/bloom/sketch.rs | 30 +++---
datasketches/src/common/binomial_bounds.rs | 52 ++++-----
datasketches/src/countmin/sketch.rs | 8 +-
datasketches/src/frequencies/mod.rs | 67 ++++++++++--
.../src/frequencies/reverse_purge_item_hash_map.rs | 4 +-
datasketches/src/frequencies/sketch.rs | 116 ++++-----------------
datasketches/src/hash/mod.rs | 1 -
datasketches/src/hll/array4.rs | 4 +-
datasketches/src/hll/array8.rs | 4 +-
datasketches/src/hll/estimator.rs | 44 ++++----
datasketches/src/hll/harmonic_numbers.rs | 4 +-
datasketches/src/hll/mod.rs | 26 ++---
datasketches/src/hll/serialization.rs | 4 +-
datasketches/src/hll/sketch.rs | 16 +--
datasketches/src/hll/union.rs | 24 ++---
datasketches/src/theta/bit_pack.rs | 12 +--
datasketches/src/theta/hash_table.rs | 4 +-
datasketches/src/theta/mod.rs | 4 +-
datasketches/src/theta/sketch.rs | 4 +-
datasketches/tests/bloom_serialization_test.rs | 9 --
datasketches/tests/hll_serialization_test.rs | 15 +--
datasketches/tests/hll_union_test.rs | 12 +--
25 files changed, 229 insertions(+), 274 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab7a242..af1928f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,8 @@ All significant changes to this project will be documented in
this file.
* `CountMinSketch` with unsigned values now supports `halve` and `decay`
operations.
* `CpcSketch` and `CpcUnion` are now available for cardinality estimation.
-* `FrequentItemsSketch` now supports serde for `u64` value.
+* `FrequentItemsSketch` now supports serde for any value implementing
`FrequentItemValue` (built-in support for `i64`, `u64`, and `String`).
+* Expose `codec::SketchBytes`, `codec::SketchSlice`, and `FrequentItemValue`
as public API.
## v0.2.0 (2026-01-14)
diff --git a/datasketches/src/bloom/builder.rs
b/datasketches/src/bloom/builder.rs
index 1918a13..6cb0158 100644
--- a/datasketches/src/bloom/builder.rs
+++ b/datasketches/src/bloom/builder.rs
@@ -22,9 +22,9 @@ use crate::hash::DEFAULT_UPDATE_SEED;
/// Builder for creating [`BloomFilter`] instances.
///
/// Provides two construction modes:
-/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false
positive rate
+/// * [`with_accuracy()`](Self::with_accuracy): Specify target items and false
positive rate
/// (recommended)
-/// - [`with_size()`](Self::with_size): Specify requested bit count and hash
functions (manual)
+/// * [`with_size()`](Self::with_size): Specify requested bit count and hash
functions (manual)
#[derive(Debug, Clone)]
pub struct BloomFilterBuilder {
num_bits: u64,
@@ -52,8 +52,8 @@ impl BloomFilterBuilder {
///
/// # Arguments
///
- /// - `max_items`: Maximum expected number of distinct items
- /// - `fpp`: Target false positive probability (e.g., 0.01 for 1%)
+ /// * `max_items`: Maximum expected number of distinct items
+ /// * `fpp`: Target false positive probability (e.g., 0.01 for 1%)
///
/// # Panics
///
@@ -95,14 +95,14 @@ impl BloomFilterBuilder {
///
/// # Arguments
///
- /// - `num_bits`: Total number of bits in the filter
- /// - `num_hashes`: Number of hash functions to use
+ /// * `num_bits`: Total number of bits in the filter
+ /// * `num_hashes`: Number of hash functions to use
///
/// # Panics
///
/// Panics if any of:
- /// - `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` >
[`Self::MAX_NUM_BITS`]
- /// - `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` >
[`Self::MIN_NUM_HASHES`]
+ /// * `num_bits` < [`Self::MIN_NUM_BITS`] or `num_bits` >
[`Self::MAX_NUM_BITS`]
+ /// * `num_hashes` < [`Self::MIN_NUM_HASHES`] or `num_hashes` >
[`Self::MAX_NUM_HASHES`]
///
/// # Examples
///
diff --git a/datasketches/src/bloom/mod.rs b/datasketches/src/bloom/mod.rs
index e5ac69e..8e58139 100644
--- a/datasketches/src/bloom/mod.rs
+++ b/datasketches/src/bloom/mod.rs
@@ -23,10 +23,10 @@
//!
//! # Properties
//!
-//! - **No false negatives**: If an item was inserted, `contains()` will
always return `true`
-//! - **Possible false positives**: `contains()` may return `true` for items
never inserted
-//! - **Fixed size**: Unlike typical sketches, Bloom filters do not resize
automatically
-//! - **Linear space**: Size is proportional to the expected number of
distinct items
+//! * **No false negatives**: If an item was inserted, `contains()` will
always return `true`
+//! * **Possible false positives**: `contains()` may return `true` for items
never inserted
+//! * **Fixed size**: Unlike typical sketches, Bloom filters do not resize
automatically
+//! * **Linear space**: Size is proportional to the expected number of
distinct items
//!
//! # Usage
//!
@@ -109,15 +109,15 @@
//!
//! # Implementation Details
//!
-//! - Uses XXHash64 for hashing
-//! - Implements double hashing (Kirsch-Mitzenmacher method) for k hash
functions
-//! - Bits packed efficiently in `u64` words
-//! - Compatible serialization format (family ID: 21)
+//! * Uses XXHash64 for hashing
+//! * Implements double hashing (Kirsch-Mitzenmacher method) for k hash
functions
+//! * Bits packed efficiently in `u64` words
+//! * Compatible serialization format (family ID: 21)
//!
//! # References
//!
-//! - Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with
allowable errors"
-//! - Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance:
Building a Better Bloom
+//! * Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with
allowable errors"
+//! * Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance:
Building a Better Bloom
//! Filter"
mod builder;
diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs
index 304c450..d7332e0 100644
--- a/datasketches/src/bloom/sketch.rs
+++ b/datasketches/src/bloom/sketch.rs
@@ -33,9 +33,9 @@ const EMPTY_FLAG_MASK: u8 = 1 << 2;
/// A Bloom filter for probabilistic set membership testing.
///
/// Provides fast membership queries with:
-/// - No false negatives (inserted items always return `true`)
-/// - Tunable false positive rate
-/// - Constant space usage
+/// * No false negatives (inserted items always return `true`)
+/// * Tunable false positive rate
+/// * Constant space usage
///
/// Use [`super::BloomFilterBuilder`] to construct instances.
#[derive(Debug, Clone, PartialEq)]
@@ -54,8 +54,8 @@ impl BloomFilter {
/// Tests whether an item is possibly in the set.
///
/// Returns:
- /// - `true`: Item was **possibly** inserted (or false positive)
- /// - `false`: Item was **definitely not** inserted
+ /// * `true`: Item was **possibly** inserted (or false positive)
+ /// * `false`: Item was **definitely not** inserted
///
/// # Examples
///
@@ -290,8 +290,8 @@ impl BloomFilter {
///
/// Uses the approximation: `load_factor^k`
/// where:
- /// - load_factor = fraction of bits set (bits_used / capacity)
- /// - k = num_hashes
+ /// * load_factor = fraction of bits set (bits_used / capacity)
+ /// * k = num_hashes
///
/// This assumes uniform bit distribution and is more accurate than
/// trying to estimate insertion count from the load factor.
@@ -307,9 +307,9 @@ impl BloomFilter {
/// Checks if two filters are compatible for merging.
///
/// Filters are compatible if they have the same:
- /// - Capacity (number of bits)
- /// - Number of hash functions
- /// - Seed
+ /// * Capacity (number of bits)
+ /// * Number of hash functions
+ /// * Seed
pub fn is_compatible(&self, other: &Self) -> bool {
self.bit_array.len() == other.bit_array.len()
&& self.num_hashes == other.num_hashes
@@ -379,9 +379,9 @@ impl BloomFilter {
/// # Errors
///
/// Returns an error if:
- /// - The data is truncated or corrupted
- /// - The family ID doesn't match (not a Bloom filter)
- /// - The serial version is unsupported
+ /// * The data is truncated or corrupted
+ /// * The family ID doesn't match (not a Bloom filter)
+ /// * The serial version is unsupported
///
/// # Examples
///
@@ -501,8 +501,8 @@ impl BloomFilter {
/// Computes the two base hash values using XXHash64.
///
/// Uses a two-hash approach:
- /// - h0 = XXHash64(item, seed)
- /// - h1 = XXHash64(item, h0)
+ /// * h0 = XXHash64(item, seed)
+ /// * h1 = XXHash64(item, h0)
fn compute_hash<T: Hash>(&self, item: &T) -> (u64, u64) {
// First hash with the configured seed
let mut hasher = XxHash64::with_seed(self.seed);
diff --git a/datasketches/src/common/binomial_bounds.rs
b/datasketches/src/common/binomial_bounds.rs
index afeeddc..c2b7d74 100644
--- a/datasketches/src/common/binomial_bounds.rs
+++ b/datasketches/src/common/binomial_bounds.rs
@@ -274,9 +274,9 @@ static UB_EQUIV_TABLE: [f64; 363] = [
///
/// # Arguments
///
-/// * `num_samples` - The number of samples in the sample set.
-/// * `theta` - The sampling probability. Must be in the range (0.0, 1.0].
-/// * `num_std_dev` - The number of standard deviations for confidence bounds.
+/// * `num_samples`: The number of samples in the sample set.
+/// * `theta`: The sampling probability. Must be in the range (0.0, 1.0].
+/// * `num_std_dev`: The number of standard deviations for confidence bounds.
///
/// # Returns
///
@@ -301,11 +301,11 @@ pub(crate) fn lower_bound(
///
/// # Arguments
///
-/// * `num_samples` - The number of samples in the sample set.
-/// * `theta` - The sampling probability. Must be in the range `(0.0, 1.0]`.
-/// * `num_std_dev` - The number of standard deviations for confidence bounds.
-/// * `no_data_seen` - This is normally false. However, in the case where you
have zero samples and
-/// a theta < 1.0, this flag enables the distinction between a virgin case
when no actual data has
+/// * `num_samples`: The number of samples in the sample set.
+/// * `theta`: The sampling probability. Must be in the range `(0.0, 1.0]`.
+/// * `num_std_dev`: The number of standard deviations for confidence bounds.
+/// * `no_data_seen`: This is normally false. However, in the case where you
have zero samples and a
+/// theta < 1.0, this flag enables the distinction between a virgin case
when no actual data has
/// been seen and the case where the estimate may be zero but an upper error
bound may still
/// exist.
///
@@ -367,16 +367,16 @@ fn cont_classic_ub(num_samples: u64, theta: f64,
num_std_devs: f64) -> f64 {
///
/// # Arguments
///
-/// * `num_samples` - The number of observed samples (k). Must be >= 1.
-/// * `p` - The sampling probability. Must satisfy: 0 < p < 1.
-/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1.
+/// * `num_samples`: The number of observed samples (k). Must be >= 1.
+/// * `p`: The sampling probability. Must satisfy: 0 < p < 1.
+/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1.
///
/// # Invariants
///
-/// - `num_samples >= 1`
-/// - `0.0 < p < 1.0`
-/// - `0.0 < delta < 1.0`
-/// - `(num_samples / p) < 500.0` (enforced for performance and numerical
stability)
+/// * `num_samples >= 1`
+/// * `0.0 < p < 1.0`
+/// * `0.0 < delta < 1.0`
+/// * `(num_samples / p) < 500.0` (enforced for performance and numerical
stability)
///
/// # Returns
///
@@ -413,15 +413,15 @@ fn special_n_star(num_samples: u64, p: f64, delta: f64)
-> Result<u64, Error> {
///
/// # Arguments
///
-/// * `num_samples` - The number of observed samples (k). Must be >= 1.
-/// * `p` - The sampling probability. Must satisfy: 0 < p < 1.
-/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1.
+/// * `num_samples`: The number of observed samples (k). Must be >= 1.
+/// * `p`: The sampling probability. Must satisfy: 0 < p < 1.
+/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1.
///
/// # Invariants
///
-/// - `num_samples >= 1`
-/// - `0.0 < p < 1.0`
-/// - `0.0 < delta < 1.0`
+/// * `num_samples >= 1`
+/// * `0.0 < p < 1.0`
+/// * `0.0 < delta < 1.0`
///
/// # Returns
///
@@ -452,14 +452,14 @@ fn special_n_prime_b(num_samples: u64, p: f64, delta:
f64) -> Result<u64, Error>
///
/// # Arguments
///
-/// * `num_samples` - The number of observed samples (k). Must be >= 1.
-/// * `p` - The sampling probability. Must satisfy: 0 < p < 1.
-/// * `delta` - The tail probability. Must satisfy: 0 < delta < 1.
+/// * `num_samples`: The number of observed samples (k). Must be >= 1.
+/// * `p`: The sampling probability. Must satisfy: 0 < p < 1.
+/// * `delta`: The tail probability. Must satisfy: 0 < delta < 1.
///
/// # Invariants
///
-/// - `(num_samples / p) < 500.0` (enforced for performance)
-/// - A super-small delta could also make it slow.
+/// * `(num_samples / p) < 500.0` (enforced for performance)
+/// * A super-small delta could also make it slow.
fn special_n_prime_f(num_samples: u64, p: f64, delta: f64) -> Result<u64,
Error> {
// Use a different algorithm if the following is true; this one will be
too slow, or worse.
if (num_samples as f64 / p) >= 500.0 {
diff --git a/datasketches/src/countmin/sketch.rs
b/datasketches/src/countmin/sketch.rs
index 3bc50a3..2116b75 100644
--- a/datasketches/src/countmin/sketch.rs
+++ b/datasketches/src/countmin/sketch.rs
@@ -75,10 +75,10 @@ impl<T: CountMinValue> CountMinSketch<T> {
/// # Panics
///
/// Panics if any of:
- /// - `num_hashes` is 0
- /// - `num_buckets` is less than 3
- /// - the total table size exceeds the supported limit
- /// - the computed seed hash is zero
+ /// * `num_hashes` is 0
+ /// * `num_buckets` is less than 3
+ /// * the total table size exceeds the supported limit
+ /// * the computed seed hash is zero
///
/// # Examples
///
diff --git a/datasketches/src/frequencies/mod.rs
b/datasketches/src/frequencies/mod.rs
index 93fb5e4..d5e7cf0 100644
--- a/datasketches/src/frequencies/mod.rs
+++ b/datasketches/src/frequencies/mod.rs
@@ -17,16 +17,66 @@
//! Frequency sketches for finding heavy hitters in data streams.
//!
-//! This module implements the Frequent Items sketch from Apache DataSketches.
It tracks
-//! approximate frequencies in a stream and can report heavy hitters with
explicit
-//! error guarantees (no false negatives or no false positives).
+//! # Overview
//!
-//! For background, see the Java documentation:
-//!
<https://apache.github.io/datasketches-java/9.0.0/org/apache/datasketches/frequencies/FrequentItemsSketch.html>
+//! This sketch is based on the paper ["A High-Performance Algorithm for
Identifying Frequent Items
+//! in Data Streams"](https://arxiv.org/abs/1705.07001) by Daniel Anderson,
Pryce Bevan, Kevin Lang,
+//! Edo Liberty, Lee Rhodes, and Justin Thaler.
//!
-//! # Usage
+//! This sketch is useful for tracking approximate frequencies of items of
type `T` that implements
+//! [`FrequentItemValue`], with optional associated counts (`T` item, `u64`
count) that are members
+//! of a multiset of such items. The true frequency of an item is defined to
be the sum of
+//! associated counts.
//!
-//! ```rust
+//! This implementation provides the following capabilities:
+//! * Estimate the frequency of an item.
+//! * Return upper and lower bounds of any item, such that the true frequency
is always between the
+//! upper and lower bounds.
+//! * Return a global maximum error that holds for all items in the stream.
+//! * Return an array of frequent items that qualify either
[`ErrorType::NoFalsePositives`] or
+//! [`ErrorType::NoFalseNegatives`].
+//! * Merge itself with another sketch created from this module.
+//! * Serialize to bytes, or deserialize from bytes, for storage or
transmission.
+//!
+//! # Accuracy
+//!
+//! If fewer than `0.75 * max_map_size` different items are inserted into the
sketch the estimated
+//! frequencies returned by the sketch will be exact.
+//!
+//! The logic of the frequent items sketch is such that the stored counts and
true counts are never
+//! too different. More specifically, for any item, the sketch can return an
estimate of the true
+//! frequency of item, along with upper and lower bounds on the frequency
(that hold
+//! deterministically).
+//!
+//! For this implementation and for a specific active item, it is guaranteed
that the true frequency
+//! will be between the Upper Bound (UB) and the Lower Bound (LB) computed for
that item.
+//! Specifically, `(UB - LB) ≤ W * epsilon`, where `W` denotes the sum of all
item counts, and
+//! `epsilon = 3.5/M`, where `M` is the `max_map_size`.
+//!
+//! This is the worst case guarantee that applies to arbitrary inputs. [^1]
+//! For inputs typically seen in practice (`UB - LB`) is usually much smaller.
+//!
+//! [^1]: For speed we do employ some randomization that introduces a small
probability that our
+//! proof of the worst-case bound might not apply to a given run. However, we
have ensured that this
+//! probability is extremely small. For example, if the stream causes one
table purge (rebuild),
+//! our proof of the worst case bound applies with probability at least `1 -
1E-14`. If the stream
+//! causes `1E9` purges, our proof applies with probability at least `1 -
1E-5`.
+//!
+//! # Background
+//!
+//! This code implements a variant of what is commonly known as the
"Misra-Gries algorithm".
+//! Variants of it were discovered and rediscovered and redesigned several
times over the years:
+//! * "Finding repeated elements", Misra, Gries, 1982
+//! * "Frequency estimation of Internet packet streams with limited space"
Demaine, Lopez-Ortiz,
+//! Munro, 2002
+//! * "A simple algorithm for finding frequent elements in streams and bags"
Karp, Shenker,
+//! Papadimitriou, 2003
+//! * "Efficient Computation of Frequent and Top-k Elements in Data Streams"
Metwally, Agrawal,
+//! Abbadi, 2006
+//!
+//! # Examples
+//!
+//! ```
//! # use datasketches::frequencies::ErrorType;
//! # use datasketches::frequencies::FrequentItemsSketch;
//! let mut sketch = FrequentItemsSketch::<i64>::new(64);
@@ -38,7 +88,7 @@
//!
//! # Serialization
//!
-//! ```rust
+//! ```
//! # use datasketches::frequencies::FrequentItemsSketch;
//! let mut sketch = FrequentItemsSketch::<i64>::new(64);
//! sketch.update_with_count(42, 2);
@@ -52,6 +102,7 @@ mod reverse_purge_item_hash_map;
mod serialization;
mod sketch;
+pub use self::serialization::FrequentItemValue;
pub use self::sketch::ErrorType;
pub use self::sketch::FrequentItemsSketch;
pub use self::sketch::Row;
diff --git a/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
b/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
index f934b87..79ed290 100644
--- a/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
+++ b/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
@@ -192,7 +192,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
T: Clone,
{
if self.num_active == 0 {
- return Vec::new();
+ return vec![];
}
let mut keys = Vec::with_capacity(self.num_active);
for i in 0..self.keys.len() {
@@ -208,7 +208,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
/// Returns the active values in the map.
pub fn active_values(&self) -> Vec<u64> {
if self.num_active == 0 {
- return Vec::new();
+ return vec![];
}
let mut values = Vec::with_capacity(self.num_active);
for i in 0..self.values.len() {
diff --git a/datasketches/src/frequencies/sketch.rs
b/datasketches/src/frequencies/sketch.rs
index 13c79f9..83de1cf 100644
--- a/datasketches/src/frequencies/sketch.rs
+++ b/datasketches/src/frequencies/sketch.rs
@@ -85,7 +85,7 @@ impl<T> Row<T> {
/// The sketch tracks approximate item frequencies and can return estimates
with
/// guaranteed upper and lower bounds.
///
-/// See [`crate::frequencies`] for an overview and error guarantees.
+/// See the [module level documentation](super) for an overview and error
guarantees.
#[derive(Debug, Clone)]
pub struct FrequentItemsSketch<T> {
lg_max_map_size: u8,
@@ -296,7 +296,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
/// Returns frequent items using the sketch maximum error as threshold.
///
- /// This is equivalent to
`frequent_items_with_threshold(self.maximum_error(), error_type)`.
+ /// This is equivalent to `frequent_items_with_threshold(error_type,
self.maximum_error())`.
///
/// # Examples
///
@@ -343,7 +343,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
T: Clone,
{
let threshold = threshold.max(self.offset);
- let mut rows = Vec::new();
+ let mut rows = vec![];
for (item, count) in self.hash_map.iter() {
let lower = count;
let upper = count + self.offset;
@@ -510,74 +510,36 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
}
}
-impl FrequentItemsSketch<i64> {
+impl<T: FrequentItemValue> FrequentItemsSketch<T> {
/// Serializes this sketch into a byte vector.
///
/// # Examples
///
- /// ```
- /// # use datasketches::frequencies::FrequentItemsSketch;
- /// # let mut sketch = FrequentItemsSketch::<i64>::new(64);
- /// # sketch.update_with_count(7, 2);
- /// let bytes = sketch.serialize();
- /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
- /// assert!(decoded.estimate(&7) >= 2);
- /// ```
- pub fn serialize(&self) -> Vec<u8> {
- self.serialize_inner(
- |items| items.iter().map(i64::serialize_size).sum(),
- |bytes, items| {
- for item in items {
- item.serialize_value(bytes);
- }
- },
- )
- }
-
- /// Deserializes a sketch from bytes.
- ///
- /// # Examples
+ /// Built-in support for `i64`:
///
/// ```
/// # use datasketches::frequencies::FrequentItemsSketch;
/// # let mut sketch = FrequentItemsSketch::<i64>::new(64);
/// # sketch.update_with_count(7, 2);
- /// # let bytes = sketch.serialize();
+ /// let bytes = sketch.serialize();
/// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
/// assert!(decoded.estimate(&7) >= 2);
/// ```
- pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
- Self::deserialize_inner(bytes, |mut cursor, num_items| {
- let mut items = Vec::with_capacity(num_items);
- for i in 0..num_items {
- let item = i64::deserialize_value(&mut cursor).map_err(|_| {
- Error::insufficient_data(format!(
- "expected {num_items} items, failed to read item at
index {i}"
- ))
- })?;
- items.push(item);
- }
- Ok(items)
- })
- }
-}
-
-impl FrequentItemsSketch<u64> {
- /// Serializes this sketch into a byte vector.
///
- /// # Examples
+ /// Built-in support for `String`:
///
/// ```
/// # use datasketches::frequencies::FrequentItemsSketch;
- /// # let mut sketch = FrequentItemsSketch::<i64>::new(64);
- /// # sketch.update_with_count(7, 2);
+ /// # let mut sketch = FrequentItemsSketch::<String>::new(64);
+ /// # let apple = "apple".to_string();
+ /// # sketch.update_with_count(apple.clone(), 2);
/// let bytes = sketch.serialize();
- /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
- /// assert!(decoded.estimate(&7) >= 2);
+ /// let decoded =
FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
+ /// assert!(decoded.estimate(&apple) >= 2);
/// ```
pub fn serialize(&self) -> Vec<u8> {
self.serialize_inner(
- |items| items.iter().map(u64::serialize_size).sum(),
+ |items| items.iter().map(T::serialize_size).sum(),
|bytes, items| {
for item in items {
item.serialize_value(bytes);
@@ -590,58 +552,18 @@ impl FrequentItemsSketch<u64> {
///
/// # Examples
///
+ /// Built-in support for `i64`:
+ ///
/// ```
/// # use datasketches::frequencies::FrequentItemsSketch;
- /// # let mut sketch = FrequentItemsSketch::<u64>::new(64);
+ /// # let mut sketch = FrequentItemsSketch::<i64>::new(64);
/// # sketch.update_with_count(7, 2);
/// # let bytes = sketch.serialize();
- /// let decoded = FrequentItemsSketch::<u64>::deserialize(&bytes).unwrap();
+ /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
/// assert!(decoded.estimate(&7) >= 2);
/// ```
- pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
- Self::deserialize_inner(bytes, |mut cursor, num_items| {
- let mut items = Vec::with_capacity(num_items);
- for i in 0..num_items {
- let item = u64::deserialize_value(&mut cursor).map_err(|_| {
- Error::insufficient_data(format!(
- "expected {num_items} items, failed to read item at
index {i}"
- ))
- })?;
- items.push(item);
- }
- Ok(items)
- })
- }
-}
-
-impl FrequentItemsSketch<String> {
- /// Serializes this sketch into a byte vector.
- ///
- /// # Examples
- ///
- /// ```
- /// # use datasketches::frequencies::FrequentItemsSketch;
- /// # let mut sketch = FrequentItemsSketch::<String>::new(64);
- /// # let apple = "apple".to_string();
- /// # sketch.update_with_count(apple.clone(), 2);
- /// let bytes = sketch.serialize();
- /// let decoded =
FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
- /// assert!(decoded.estimate(&apple) >= 2);
- /// ```
- pub fn serialize(&self) -> Vec<u8> {
- self.serialize_inner(
- |items| items.iter().map(String::serialize_size).sum(),
- |bytes, items| {
- for item in items {
- item.serialize_value(bytes);
- }
- },
- )
- }
-
- /// Deserializes a sketch from bytes.
///
- /// # Examples
+ /// Built-in support for `String`:
///
/// ```
/// # use datasketches::frequencies::FrequentItemsSketch;
@@ -656,7 +578,7 @@ impl FrequentItemsSketch<String> {
Self::deserialize_inner(bytes, |mut cursor, num_items| {
let mut items = Vec::with_capacity(num_items);
for i in 0..num_items {
- let item = String::deserialize_value(&mut cursor).map_err(|_| {
+ let item = T::deserialize_value(&mut cursor).map_err(|_| {
Error::insufficient_data(format!(
"expected {num_items} items, failed to read item at
index {i}"
))
diff --git a/datasketches/src/hash/mod.rs b/datasketches/src/hash/mod.rs
index 87eaf22..99d2cca 100644
--- a/datasketches/src/hash/mod.rs
+++ b/datasketches/src/hash/mod.rs
@@ -19,7 +19,6 @@ mod murmurhash;
mod xxhash;
pub(crate) use self::murmurhash::MurmurHash3X64128;
-#[allow(unused_imports)]
pub(crate) use self::xxhash::XxHash64;
/// The seed 9001 used in the sketch update methods is a prime number that was
chosen very early
diff --git a/datasketches/src/hll/array4.rs b/datasketches/src/hll/array4.rs
index a17b4da..073c335 100644
--- a/datasketches/src/hll/array4.rs
+++ b/datasketches/src/hll/array4.rs
@@ -79,8 +79,8 @@ impl Array4 {
/// Get the actual value at a slot (adjusted for cur_min and aux_map)
///
/// Returns the true register value:
- /// - If raw < 15: value = cur_min + raw
- /// - If raw == 15 (AUX_TOKEN): value is in aux_map
+ /// * If raw < 15: value = cur_min + raw
+ /// * If raw == 15 (AUX_TOKEN): value is in aux_map
pub(super) fn get(&self, slot: u32) -> u8 {
let raw = self.get_raw(slot);
diff --git a/datasketches/src/hll/array8.rs b/datasketches/src/hll/array8.rs
index 00faf16..2bd1509 100644
--- a/datasketches/src/hll/array8.rs
+++ b/datasketches/src/hll/array8.rs
@@ -187,8 +187,8 @@ impl Array8 {
///
/// # Parameters
///
- /// * `src` - Source register values (length must be 2^src_lg_k)
- /// * `src_lg_k` - Log2 of source register count
+ /// * `src`: Source register values (length must be 2^src_lg_k)
+ /// * `src_lg_k`: Log2 of source register count
///
/// # Panics
///
diff --git a/datasketches/src/hll/estimator.rs
b/datasketches/src/hll/estimator.rs
index a9bd63b..7c9ca08 100644
--- a/datasketches/src/hll/estimator.rs
+++ b/datasketches/src/hll/estimator.rs
@@ -33,9 +33,9 @@ use crate::hll::harmonic_numbers;
/// allowing it to be composed into Array4, Array6, and Array8.
///
/// The estimator supports two modes:
-/// - **In-order mode**: Uses HIP (Historical Inverse Probability) accumulator
for accurate
+/// * **In-order mode**: Uses HIP (Historical Inverse Probability) accumulator
for accurate
/// sequential updates
-/// - **Out-of-order mode**: Uses composite estimator (raw HLL + linear
counting) after
+/// * **Out-of-order mode**: Uses composite estimator (raw HLL + linear
counting) after
/// deserialization or merging
#[derive(Debug, Clone, PartialEq)]
pub struct HipEstimator {
@@ -71,8 +71,8 @@ impl HipEstimator {
/// 2. Update KxQ registers (always)
///
/// The KxQ registers are split for numerical precision:
- /// - kxq0: sum of 1/2^v for v < 32
- /// - kxq1: sum of 1/2^v for v >= 32
+ /// * kxq0: sum of 1/2^v for v < 32
+ /// * kxq1: sum of 1/2^v for v >= 32
pub fn update(&mut self, lg_config_k: u8, old_value: u8, new_value: u8) {
let k = (1 << lg_config_k) as f64;
@@ -109,9 +109,9 @@ impl HipEstimator {
///
/// # Arguments
///
- /// * `lg_config_k` - Log2 of number of registers (k)
- /// * `cur_min` - Current minimum register value (for Array4, 0 for
Array6/8)
- /// * `num_at_cur_min` - Number of registers at cur_min value
+ /// * `lg_config_k`: Log2 of number of registers (k)
+ /// * `cur_min`: Current minimum register value (for Array4, 0 for
Array6/8)
+ /// * `num_at_cur_min`: Number of registers at cur_min value
pub fn estimate(&self, lg_config_k: u8, cur_min: u8, num_at_cur_min: u32)
-> f64 {
if self.out_of_order {
self.get_composite_estimate(lg_config_k, cur_min, num_at_cur_min)
@@ -126,10 +126,10 @@ impl HipEstimator {
///
/// # Arguments
///
- /// * `lg_config_k` - Log2 of number of registers (k)
- /// * `cur_min` - Current minimum register value (for Array4, 0 for
Array6/8)
- /// * `num_at_cur_min` - Number of registers at cur_min value
- /// * `num_std_dev` - Number of standard deviations (1, 2, or 3)
+ /// * `lg_config_k`: Log2 of number of registers (k)
+ /// * `cur_min`: Current minimum register value (for Array4, 0 for
Array6/8)
+ /// * `num_at_cur_min`: Number of registers at cur_min value
+ /// * `num_std_dev`: Number of standard deviations (1, 2, or 3)
pub fn upper_bound(
&self,
lg_config_k: u8,
@@ -149,10 +149,10 @@ impl HipEstimator {
///
/// # Arguments
///
- /// * `lg_config_k` - Log2 of number of registers (k)
- /// * `cur_min` - Current minimum register value (for Array4, 0 for
Array6/8)
- /// * `num_at_cur_min` - Number of registers at cur_min value
- /// * `num_std_dev` - Number of standard deviations (1, 2, or 3)
+ /// * `lg_config_k`: Log2 of number of registers (k)
+ /// * `cur_min`: Current minimum register value (for Array4, 0 for
Array6/8)
+ /// * `num_at_cur_min`: Number of registers at cur_min value
+ /// * `num_std_dev`: Number of standard deviations (1, 2, or 3)
pub fn lower_bound(
&self,
lg_config_k: u8,
@@ -286,8 +286,8 @@ impl HipEstimator {
/// Set the out-of-order flag
///
/// This should be set to true when:
- /// - Deserializing a sketch from bytes
- /// - After a merge/union operation
+ /// * Deserializing a sketch from bytes
+ /// * After a merge/union operation
pub fn set_out_of_order(&mut self, ooo: bool) {
self.out_of_order = ooo;
if ooo {
@@ -331,10 +331,10 @@ fn inv_pow2(value: u8) -> f64 {
///
/// # Arguments
///
-/// * `lg_config_k` - Log2 of number of registers (must be 4-21)
-/// * `upper_bound` - Whether computing upper bound (vs lower bound)
-/// * `ooo` - Whether sketch is out-of-order (merged/deserialized)
-/// * `num_std_dev` - Number of standard deviations (1, 2, or 3)
+/// * `lg_config_k`: Log2 of number of registers (must be 4-21)
+/// * `upper_bound`: Whether computing upper bound (vs lower bound)
+/// * `ooo`: Whether sketch is out-of-order (merged/deserialized)
+/// * `num_std_dev`: Number of standard deviations (1, 2, or 3)
///
/// # Returns
///
@@ -357,7 +357,7 @@ fn get_rel_err(lg_config_k: u8, upper_bound: bool, ooo:
bool, num_std_dev: NumSt
return sign * (num_std_dev as u8 as f64) * rse_factor / k.sqrt();
}
- // For lg_k <= 12, use empirically measured lookup tables
+ // For lg_k <= 12, use empirically measured lookup tables.
// Tables are indexed by: ((lg_k - 4) * 3) + (num_std_dev - 1)
let idx = ((lg_config_k as usize) - 4) * 3 + ((num_std_dev as usize) - 1);
diff --git a/datasketches/src/hll/harmonic_numbers.rs
b/datasketches/src/hll/harmonic_numbers.rs
index cdc4161..dea7141 100644
--- a/datasketches/src/hll/harmonic_numbers.rs
+++ b/datasketches/src/hll/harmonic_numbers.rs
@@ -86,8 +86,8 @@ fn harmonic_number(n: usize) -> f64 {
///
/// # Arguments
///
-/// * `bit_vector_length` - Total length of bit vector (k for HLL)
-/// * `num_bits_set` - Number of bits set (non-zero registers)
+/// * `bit_vector_length`: Total length of bit vector (k for HLL)
+/// * `num_bits_set`: Number of bits set (non-zero registers)
///
/// # Returns
///
diff --git a/datasketches/src/hll/mod.rs b/datasketches/src/hll/mod.rs
index f9476fe..6f99a49 100644
--- a/datasketches/src/hll/mod.rs
+++ b/datasketches/src/hll/mod.rs
@@ -26,9 +26,9 @@
//! This implementation follows the Apache DataSketches specification and
supports multiple
//! storage modes that automatically adapt based on cardinality:
//!
-//! - **List mode**: Stores individual values for small cardinalities
-//! - **Set mode**: Uses a hash set for medium cardinalities
-//! - **HLL mode**: Uses compact arrays for large cardinalities
+//! * **List mode**: Stores individual values for small cardinalities
+//! * **Set mode**: Uses a hash set for medium cardinalities
+//! * **HLL mode**: Uses compact arrays for large cardinalities
//!
//! Mode transitions are automatic and transparent to the user. Each promotion
preserves
//! all previously observed values and maintains estimation accuracy.
@@ -44,9 +44,9 @@
//!
//! Three target HLL types are supported, trading precision for memory:
//!
-//! - [`HllType::Hll4`]: 4 bits per bucket (most compact)
-//! - [`HllType::Hll6`]: 6 bits per bucket (balanced)
-//! - [`HllType::Hll8`]: 8 bits per bucket (highest precision)
+//! * [`HllType::Hll4`]: 4 bits per bucket (most compact)
+//! * [`HllType::Hll6`]: 6 bits per bucket (balanced)
+//! * [`HllType::Hll8`]: 8 bits per bucket (highest precision)
//!
//! # Union Operations
//!
@@ -54,9 +54,9 @@
//! It maintains an internal "gadget" sketch that accumulates the union of all
input sketches
//! and automatically handles:
//!
-//! - Sketches with different `lg_k` precision levels (resizes/downsamples as
needed)
-//! - Sketches in different modes (List, Set, or Array)
-//! - Sketches with different target HLL types
+//! * Sketches with different `lg_k` precision levels (resizes/downsamples as
needed)
+//! * Sketches in different modes (List, Set, or Array)
+//! * Sketches with different target HLL types
//!
//! The union operation preserves cardinality estimation accuracy while
enabling distributed
//! computation patterns where sketches are built independently and merged
later.
@@ -64,10 +64,10 @@
//! # Serialization
//!
//! Sketches can be serialized and deserialized while preserving all state,
including:
-//! - Current mode and HLL type
-//! - All observed values (coupons or register values)
-//! - HIP accumulator state for accurate estimation
-//! - Out-of-order flag for merged/deserialized sketches
+//! * Current mode and HLL type
+//! * All observed values (coupons or register values)
+//! * HIP accumulator state for accurate estimation
+//! * Out-of-order flag for merged/deserialized sketches
//!
//! The serialization format is compatible with Apache DataSketches
implementations
//! in Java and C++, enabling cross-platform sketch exchange.
diff --git a/datasketches/src/hll/serialization.rs
b/datasketches/src/hll/serialization.rs
index 014b890..30740a9 100644
--- a/datasketches/src/hll/serialization.rs
+++ b/datasketches/src/hll/serialization.rs
@@ -64,8 +64,8 @@ pub fn extract_tgt_hll_type(mode_byte: u8) -> u8 {
///
/// # Arguments
///
-/// * `cur_mode` - 0 = LIST, 1 = SET, 2 = HLL
-/// * `tgt_type` - 0 = HLL4, 1 = HLL6, 2 = HLL8
+/// * `cur_mode`: 0 = LIST, 1 = SET, 2 = HLL
+/// * `tgt_type`: 0 = HLL4, 1 = HLL6, 2 = HLL8
#[inline]
pub fn encode_mode_byte(cur_mode: u8, tgt_type: u8) -> u8 {
(cur_mode & 0x3) | ((tgt_type & 0x3) << 2)
diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs
index 484e16a..ecf3ff1 100644
--- a/datasketches/src/hll/sketch.rs
+++ b/datasketches/src/hll/sketch.rs
@@ -54,15 +54,15 @@ impl HllSketch {
///
/// # Arguments
///
- /// * `lg_config_k` - Log2 of the number of buckets (K). Must be in [4,
21].
- /// - lg_k=4: 16 buckets, ~26% relative error
- /// - lg_k=12: 4096 buckets, ~1.6% relative error (common choice)
- /// - lg_k=21: 2M buckets, ~0.4% relative error
- /// * `hll_type` - Target HLL array type (Hll4, Hll6, or Hll8)
+ /// * `lg_config_k`: Log2 of the number of buckets (K). Must be in `[4,
21]`.
+ /// * lg_k=4: 16 buckets, ~26% relative error
+ /// * lg_k=12: 4096 buckets, ~1.6% relative error (common choice)
+ /// * lg_k=21: 2M buckets, ~0.4% relative error
+ /// * `hll_type`: Target HLL array type (Hll4, Hll6, or Hll8)
///
/// # Panics
///
- /// If lg_config_k is not in range [4, 21]
+ /// If lg_config_k is not in range `[4, 21]`
///
/// # Examples
///
@@ -94,8 +94,8 @@ impl HllSketch {
///
/// # Arguments
///
- /// * `lg_config_k` - Log2 of the number of buckets (K)
- /// * `mode` - The mode to initialize the sketch with
+ /// * `lg_config_k`: Log2 of the number of buckets (K)
+ /// * `mode`: The mode to initialize the sketch with
pub(super) fn from_mode(lg_config_k: u8, mode: Mode) -> Self {
Self { lg_config_k, mode }
}
diff --git a/datasketches/src/hll/union.rs b/datasketches/src/hll/union.rs
index 03fb4ea..5f3929d 100644
--- a/datasketches/src/hll/union.rs
+++ b/datasketches/src/hll/union.rs
@@ -24,9 +24,9 @@
//!
//! The union maintains an internal "gadget" sketch that accumulates the union
//! of all input sketches. It can handle sketches with:
-//! - Different lg_k values (automatically resizes as needed)
-//! - Different modes (List, Set, Array4/6/8)
-//! - Different target HLL types
+//! * Different lg_k values (automatically resizes as needed)
+//! * Different modes (List, Set, Array4/6/8)
+//! * Different target HLL types
use std::hash::Hash;
@@ -59,13 +59,13 @@ impl HllUnion {
///
/// # Arguments
///
- /// * `lg_max_k` - Maximum log2 of the number of buckets. Must be in [4,
21]. This determines
+ /// * `lg_max_k`: Maximum log2 of the number of buckets. Must be in `[4,
21]`. This determines
/// the maximum precision the union can handle. Input sketches with
larger lg_k will be
/// down-sampled.
///
/// # Panics
///
- /// Panics if `lg_max_k` is not in the range [4, 21].
+ /// Panics if `lg_max_k` is not in the range `[4, 21]`.
///
/// # Examples
///
@@ -110,9 +110,9 @@ impl HllUnion {
/// Update the union with another sketch
///
/// Merges the input sketch into the union's internal gadget, handling:
- /// - Sketches with different lg_k values (resizes/downsamples as needed)
- /// - Sketches in different modes (List, Set, Array4/6/8)
- /// - Sketches with different target HLL types
+ /// * Sketches with different lg_k values (resizes/downsamples as needed)
+ /// * Sketches in different modes (List, Set, Array4/6/8)
+ /// * Sketches with different target HLL types
///
/// # Examples
///
@@ -244,7 +244,7 @@ impl HllUnion {
///
/// # Arguments
///
- /// * `hll_type` - The target HLL type for the result sketch (Hll4, Hll6,
or Hll8)
+ /// * `hll_type`: The target HLL type for the result sketch (Hll4, Hll6,
or Hll8)
///
/// # Examples
///
@@ -401,9 +401,9 @@ fn merge_coupons_into_mode(dst: &mut Array8, src_mode:
&Mode) {
/// Merge an HLL array into an Array8
///
/// Handles merging from Array4, Array6, or Array8 sources. Dispatches based
on lg_k:
-/// - Same lg_k: optimized bulk merge
-/// - src lg_k > dst lg_k: downsample src into dst
-/// - src lg_k < dst lg_k: handled by caller (requires gadget replacement)
+/// * Same lg_k: optimized bulk merge
+/// * src lg_k > dst lg_k: downsample src into dst
+/// * src lg_k < dst lg_k: handled by caller (requires gadget replacement)
fn merge_array_into_array8(dst_array8: &mut Array8, dst_lg_k: u8, src_mode:
&Mode, src_lg_k: u8) {
assert!(
src_lg_k >= dst_lg_k,
diff --git a/datasketches/src/theta/bit_pack.rs
b/datasketches/src/theta/bit_pack.rs
index 031afce..2a59351 100644
--- a/datasketches/src/theta/bit_pack.rs
+++ b/datasketches/src/theta/bit_pack.rs
@@ -4972,9 +4972,9 @@ fn unpack_bits_63(values: &mut [u64], bytes: &[u8]) {
///
/// # Panics
///
-/// - Panics if `values.len()` is not equal to `BLOCK_WIDTH`.
-/// - Panics if `bits` is not in the range `1..=63`.
-/// - Panics if `bytes.len()` is less than `bits * BLOCK_WIDTH`.
+/// * Panics if `values.len()` is not equal to `BLOCK_WIDTH`.
+/// * Panics if `bits` is not in the range `1..=63`.
+/// * Panics if `bytes.len()` is less than `bits * BLOCK_WIDTH`.
pub(crate) fn pack_bits_block(values: &[u64], bytes: &mut [u8], bits: u8) {
assert_eq!(values.len(), BLOCK_WIDTH, "values length must be 8");
assert!(
@@ -5058,9 +5058,9 @@ pub(crate) fn pack_bits_block(values: &[u64], bytes: &mut
[u8], bits: u8) {
///
/// # Panics
///
-/// - Panics if `values.len()` is not equal to `BLOCK_WIDTH`.
-/// - Panics if `bits` is not in the range `1..=63`.
-/// - Panics if `bytes.len()` is less than `bits * BLOCK_WIDTH`.
+/// * Panics if `values.len()` is not equal to `BLOCK_WIDTH`.
+/// * Panics if `bits` is not in the range `1..=63`.
+/// * Panics if `bytes.len()` is less than `bits * BLOCK_WIDTH`.
pub(crate) fn unpack_bits_block(values: &mut [u64], bytes: &[u8], bits: u8) {
assert_eq!(values.len(), BLOCK_WIDTH, "values length must be 8");
assert!(
diff --git a/datasketches/src/theta/hash_table.rs
b/datasketches/src/theta/hash_table.rs
index d77304e..b06c695 100644
--- a/datasketches/src/theta/hash_table.rs
+++ b/datasketches/src/theta/hash_table.rs
@@ -587,7 +587,7 @@ mod tests {
let mut table = ThetaHashTable::new(8, ResizeFactor::X8, 1.0,
DEFAULT_UPDATE_SEED);
// Insert some values
- let mut inserted_hashes = Vec::new();
+ let mut inserted_hashes = vec![];
for i in 0..10 {
let hash = table.hash_and_screen(format!("value_{}", i));
if hash != 0 && table.try_insert(hash) {
@@ -633,7 +633,7 @@ mod tests {
// Insert many values to trigger rebuild
let mut i = 0;
- let mut inserted_hashes = Vec::new();
+ let mut inserted_hashes = vec![];
loop {
let hash = table.hash_and_screen(format!("value_{}", i));
i += 1;
diff --git a/datasketches/src/theta/mod.rs b/datasketches/src/theta/mod.rs
index 1d33a71..fdde037 100644
--- a/datasketches/src/theta/mod.rs
+++ b/datasketches/src/theta/mod.rs
@@ -27,8 +27,8 @@
//! Theta sketches provide approximate distinct count (cardinality) estimation
with
//! configurable accuracy and memory usage. The implementation supports:
//!
-//! - **ThetaSketch**: Mutable sketch for building from input data
-//! - **CompactThetaSketch**: Immutable sketch with compact memory layout
+//! * **ThetaSketch**: Mutable sketch for building from input data
+//! * **CompactThetaSketch**: Immutable sketch with compact memory layout
//!
//! # Usage
//!
diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs
index a56d3c4..32f6e9a 100644
--- a/datasketches/src/theta/sketch.rs
+++ b/datasketches/src/theta/sketch.rs
@@ -236,7 +236,7 @@ impl ThetaSketch {
///
/// # Arguments
///
- /// * `num_std_dev` - The number of standard deviations for confidence
bounds.
+ /// * `num_std_dev`: The number of standard deviations for confidence
bounds.
///
/// # Examples
///
@@ -270,7 +270,7 @@ impl ThetaSketch {
///
/// # Arguments
///
- /// * `num_std_dev` - The number of standard deviations for confidence
bounds.
+ /// * `num_std_dev`: The number of standard deviations for confidence
bounds.
///
/// # Examples
///
diff --git a/datasketches/tests/bloom_serialization_test.rs
b/datasketches/tests/bloom_serialization_test.rs
index 5370f89..15daba2 100644
--- a/datasketches/tests/bloom_serialization_test.rs
+++ b/datasketches/tests/bloom_serialization_test.rs
@@ -15,15 +15,6 @@
// specific language governing permissions and limitations
// under the License.
-//! Bloom Filter Serialization Compatibility Tests
-//!
-//! These tests verify binary compatibility with Apache DataSketches
implementations:
-//! - Java (datasketches-java)
-//! - C++ (datasketches-cpp)
-//!
-//! Test data is generated by the reference implementations and stored in:
-//! `tests/serialization_test_data/`
-
mod common;
use std::fs;
diff --git a/datasketches/tests/hll_serialization_test.rs
b/datasketches/tests/hll_serialization_test.rs
index 9c8200f..a7e00e6 100644
--- a/datasketches/tests/hll_serialization_test.rs
+++ b/datasketches/tests/hll_serialization_test.rs
@@ -15,15 +15,6 @@
// specific language governing permissions and limitations
// under the License.
-//! HLL Sketch Serialization Compatibility Tests
-//!
-//! These tests verify binary compatibility with Apache DataSketches
implementations:
-//! - Java (datasketches-java)
-//! - C++ (datasketches-cpp)
-//!
-//! Test data is generated by the reference implementations and stored in:
-//! `tests/serialization_test_data/`
-
mod common;
use std::fs;
@@ -48,9 +39,9 @@ fn test_sketch_file(path: PathBuf, expected_cardinality:
usize, expected_lg_k: u
// Check cardinality estimate with error bounds
// For lg_k=12, theoretical RSE ≈ 1.625%, but we use 2% margin to account
for:
- // - Small sample sizes (especially n < 100)
- // - Out-of-order mode (composite estimator)
- // - Variation across implementations
+ // * Small sample sizes (especially n < 100)
+ // * Out-of-order mode (composite estimator)
+ // * Variation across implementations
if expected > 0.0 {
let error_margin = 0.02; // 2% error margin
let lower_bound = expected * (1.0 - error_margin);
diff --git a/datasketches/tests/hll_union_test.rs
b/datasketches/tests/hll_union_test.rs
index 2f17a29..91080bf 100644
--- a/datasketches/tests/hll_union_test.rs
+++ b/datasketches/tests/hll_union_test.rs
@@ -18,12 +18,12 @@
//! HyperLogLog Union Integration Tests
//!
//! These tests verify the public API behavior of HllUnion, focusing on:
-//! - Basic union operations
-//! - Mode transitions and mixed-mode unions
-//! - Different HLL types and lg_k values
-//! - Bounds and statistical properties
-//! - Mathematical properties (commutativity, associativity, idempotency)
-//! - Reset and reuse patterns
+//! * Basic union operations
+//! * Mode transitions and mixed-mode unions
+//! * Different HLL types and lg_k values
+//! * Bounds and statistical properties
+//! * Mathematical properties (commutativity, associativity, idempotency)
+//! * Reset and reuse patterns
//!
//! This mirrors the testing strategy used in hll_update_test.rs
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]