This is an automated email from the ASF dual-hosted git repository.

tison pushed a commit to branch export-frequency-item-sketch
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git

commit 2287c4d8f4bf8e3c5699d509038d0932a407ae85
Author: tison <[email protected]>
AuthorDate: Thu Feb 19 12:34:38 2026 +0800

    refactor: export FrequentItemValue and improve docs
    
    Signed-off-by: tison <[email protected]>
---
 datasketches/src/frequencies/mod.rs | 67 ++++++++++++++++++++++++++++++++-----
 datasketches/src/hash/mod.rs        |  1 -
 datasketches/src/hll/sketch.rs      | 10 +++---
 datasketches/src/hll/union.rs       |  6 ++--
 4 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/datasketches/src/frequencies/mod.rs 
b/datasketches/src/frequencies/mod.rs
index 93fb5e4..a63e72d 100644
--- a/datasketches/src/frequencies/mod.rs
+++ b/datasketches/src/frequencies/mod.rs
@@ -17,16 +17,66 @@
 
 //! Frequency sketches for finding heavy hitters in data streams.
 //!
-//! This module implements the Frequent Items sketch from Apache DataSketches. 
It tracks
-//! approximate frequencies in a stream and can report heavy hitters with 
explicit
-//! error guarantees (no false negatives or no false positives).
+//! # Overview
 //!
-//! For background, see the Java documentation:
-//! 
<https://apache.github.io/datasketches-java/9.0.0/org/apache/datasketches/frequencies/FrequentItemsSketch.html>
+//! This sketch is based on the paper ["A High-Performance Algorithm for 
Identifying Frequent Items
+//! in Data Streams"](https://arxiv.org/abs/1705.07001) by Daniel Anderson, 
Pryce Bevan, Kevin Lang,
+//! Edo Liberty, Lee Rhodes, and Justin Thaler.
 //!
-//! # Usage
+//! This sketch is useful for tracking approximate frequencies of items of 
type `T` that implements
+//! [`FrequentItemValue`], with optional associated counts (`T` item, `u64` 
count) that are members
+//! of a multiset of such items. The true frequency of an item is defined to 
be the sum of
+//! associated counts.
 //!
-//! ```rust
+//! This implementation provides the following capabilities:
+//! - Estimate the frequency of an item.
+//! - Return upper and lower bounds of any item, such that the true frequency 
is always between the
+//!   upper and lower bounds.
+//! - Return a global maximum error that holds for all items in the stream.
+//! - Return an array of frequent items that qualify either 
[`ErrorType::NoFalsePositives`] or
+//!   [`ErrorType::NoFalseNegatives`].
+//! - Merge itself with another sketch created from this module.
+//! - Serialize to bytes, or deserialize from bytes, for storage or 
transmission.
+//!
+//! # Accuracy
+//!
+//! If fewer than `0.75 * max_map_size` different items are inserted into the 
sketch the estimated
+//! frequencies returned by the sketch will be exact.
+//!
+//! The logic of the frequent items sketch is such that the stored counts and 
true counts are never
+//! too different. More specifically, for any item, the sketch can return an 
estimate of the true
+//! frequency of item, along with upper and lower bounds on the frequency 
(that hold
+//! deterministically).
+//!
+//! For this implementation and for a specific active item, it is guaranteed 
that the true frequency
+//! will be between the Upper Bound (UB) and the Lower Bound (LB) computed for 
that item.
+//! Specifically, `(UB - LB) ≤ W * epsilon`, where `W` denotes the sum of all 
item counts, and
+//! `epsilon = 3.5/M`, where `M` is the `max_map_size`.
+//!
+//! This is the worst case guarantee that applies to arbitrary inputs. [^1]
+//! For inputs typically seen in practice (`UB - LB`) is usually much smaller.
+//!
+//! [^1]: For speed we do employ some randomization that introduces a small 
probability that our
+//! proof of the worst-case bound might not apply to a given run. However, we 
have ensured that this
+//! probability is extremely small. For example, if the stream causes one 
table purge (rebuild),
+//! our proof of the worst case bound applies with probability at least `1 - 
1E-14`. If the stream
+//! causes `1E9` purges, our proof applies with probability at least `1 - 
1E-5`.
+//!
+//! # Background
+//!
+//! This code implements a variant of what is commonly known as the 
"Misra-Gries algorithm".
+//! Variants of it were discovered and rediscovered and redesigned several 
times over the years:
+//! - "Finding repeated elements", Misra, Gries, 1982
+//! - "Frequency estimation of Internet packet streams with limited space" 
Demaine, Lopez-Ortiz,
+//!   Munro, 2002
+//! - "A simple algorithm for finding frequent elements in streams and bags" 
Karp, Shenker,
+//!   Papadimitriou, 2003
+//! - "Efficient Computation of Frequent and Top-k Elements in Data Streams" 
Metwally, Agrawal,
+//!   Abbadi, 2006
+//!
+//! # Examples
+//!
+//! ```
 //! # use datasketches::frequencies::ErrorType;
 //! # use datasketches::frequencies::FrequentItemsSketch;
 //! let mut sketch = FrequentItemsSketch::<i64>::new(64);
@@ -38,7 +88,7 @@
 //!
 //! # Serialization
 //!
-//! ```rust
+//! ```
 //! # use datasketches::frequencies::FrequentItemsSketch;
 //! let mut sketch = FrequentItemsSketch::<i64>::new(64);
 //! sketch.update_with_count(42, 2);
@@ -52,6 +102,7 @@ mod reverse_purge_item_hash_map;
 mod serialization;
 mod sketch;
 
+pub use self::serialization::FrequentItemValue;
 pub use self::sketch::ErrorType;
 pub use self::sketch::FrequentItemsSketch;
 pub use self::sketch::Row;
diff --git a/datasketches/src/hash/mod.rs b/datasketches/src/hash/mod.rs
index 87eaf22..99d2cca 100644
--- a/datasketches/src/hash/mod.rs
+++ b/datasketches/src/hash/mod.rs
@@ -19,7 +19,6 @@ mod murmurhash;
 mod xxhash;
 
 pub(crate) use self::murmurhash::MurmurHash3X64128;
-#[allow(unused_imports)]
 pub(crate) use self::xxhash::XxHash64;
 
 /// The seed 9001 used in the sketch update methods is a prime number that was 
chosen very early
diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs
index 484e16a..3b8f173 100644
--- a/datasketches/src/hll/sketch.rs
+++ b/datasketches/src/hll/sketch.rs
@@ -54,15 +54,15 @@ impl HllSketch {
     ///
     /// # Arguments
     ///
-    /// * `lg_config_k` - Log2 of the number of buckets (K). Must be in [4, 
21].
+    /// * `lg_config_k`: Log2 of the number of buckets (K). Must be in `[4, 
21]`.
     ///   - lg_k=4: 16 buckets, ~26% relative error
     ///   - lg_k=12: 4096 buckets, ~1.6% relative error (common choice)
     ///   - lg_k=21: 2M buckets, ~0.4% relative error
-    /// * `hll_type` - Target HLL array type (Hll4, Hll6, or Hll8)
+    /// * `hll_type`: Target HLL array type (Hll4, Hll6, or Hll8)
     ///
     /// # Panics
     ///
-    /// If lg_config_k is not in range [4, 21]
+    /// If lg_config_k is not in range `[4, 21]`
     ///
     /// # Examples
     ///
@@ -94,8 +94,8 @@ impl HllSketch {
     ///
     /// # Arguments
     ///
-    /// * `lg_config_k` - Log2 of the number of buckets (K)
-    /// * `mode` - The mode to initialize the sketch with
+    /// * `lg_config_k`: Log2 of the number of buckets (K)
+    /// * `mode`: The mode to initialize the sketch with
     pub(super) fn from_mode(lg_config_k: u8, mode: Mode) -> Self {
         Self { lg_config_k, mode }
     }
diff --git a/datasketches/src/hll/union.rs b/datasketches/src/hll/union.rs
index 03fb4ea..54b72e5 100644
--- a/datasketches/src/hll/union.rs
+++ b/datasketches/src/hll/union.rs
@@ -59,13 +59,13 @@ impl HllUnion {
     ///
     /// # Arguments
     ///
-    /// * `lg_max_k` - Maximum log2 of the number of buckets. Must be in [4, 
21]. This determines
+    /// * `lg_max_k`: Maximum log2 of the number of buckets. Must be in `[4, 
21]`. This determines
     ///   the maximum precision the union can handle. Input sketches with 
larger lg_k will be
     ///   down-sampled.
     ///
     /// # Panics
     ///
-    /// Panics if `lg_max_k` is not in the range [4, 21].
+    /// Panics if `lg_max_k` is not in the range `[4, 21]`.
     ///
     /// # Examples
     ///
@@ -244,7 +244,7 @@ impl HllUnion {
     ///
     /// # Arguments
     ///
-    /// * `hll_type` - The target HLL type for the result sketch (Hll4, Hll6, 
or Hll8)
+    /// * `hll_type`: The target HLL type for the result sketch (Hll4, Hll6, 
or Hll8)
     ///
     /// # Examples
     ///


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to