This is an automated email from the ASF dual-hosted git repository. tison pushed a commit to branch export-frequency-item-sketch in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
commit 2287c4d8f4bf8e3c5699d509038d0932a407ae85 Author: tison <[email protected]> AuthorDate: Thu Feb 19 12:34:38 2026 +0800 refactor: export FrequentItemValue and improve docs Signed-off-by: tison <[email protected]> --- datasketches/src/frequencies/mod.rs | 67 ++++++++++++++++++++++++++++++++----- datasketches/src/hash/mod.rs | 1 - datasketches/src/hll/sketch.rs | 10 +++--- datasketches/src/hll/union.rs | 6 ++-- 4 files changed, 67 insertions(+), 17 deletions(-) diff --git a/datasketches/src/frequencies/mod.rs b/datasketches/src/frequencies/mod.rs index 93fb5e4..a63e72d 100644 --- a/datasketches/src/frequencies/mod.rs +++ b/datasketches/src/frequencies/mod.rs @@ -17,16 +17,66 @@ //! Frequency sketches for finding heavy hitters in data streams. //! -//! This module implements the Frequent Items sketch from Apache DataSketches. It tracks -//! approximate frequencies in a stream and can report heavy hitters with explicit -//! error guarantees (no false negatives or no false positives). +//! # Overview //! -//! For background, see the Java documentation: -//! <https://apache.github.io/datasketches-java/9.0.0/org/apache/datasketches/frequencies/FrequentItemsSketch.html> +//! This sketch is based on the paper ["A High-Performance Algorithm for Identifying Frequent Items +//! in Data Streams"](https://arxiv.org/abs/1705.07001) by Daniel Anderson, Pryce Bevan, Kevin Lang, +//! Edo Liberty, Lee Rhodes, and Justin Thaler. //! -//! # Usage +//! This sketch is useful for tracking approximate frequencies of items of type `T` that implements +//! [`FrequentItemValue`], with optional associated counts (`T` item, `u64` count) that are members +//! of a multiset of such items. The true frequency of an item is defined to be the sum of +//! associated counts. //! -//! ```rust +//! This implementation provides the following capabilities: +//! - Estimate the frequency of an item. +//! - Return upper and lower bounds of any item, such that the true frequency is always between the +//! upper and lower bounds. +//! - Return a global maximum error that holds for all items in the stream. +//! - Return an array of frequent items that qualify either [`ErrorType::NoFalsePositives`] or +//! [`ErrorType::NoFalseNegatives`]. +//! - Merge itself with another sketch created from this module. +//! - Serialize to bytes, or deserialize from bytes, for storage or transmission. +//! +//! # Accuracy +//! +//! If fewer than `0.75 * max_map_size` different items are inserted into the sketch the estimated +//! frequencies returned by the sketch will be exact. +//! +//! The logic of the frequent items sketch is such that the stored counts and true counts are never +//! too different. More specifically, for any item, the sketch can return an estimate of the true +//! frequency of item, along with upper and lower bounds on the frequency (that hold +//! deterministically). +//! +//! For this implementation and for a specific active item, it is guaranteed that the true frequency +//! will be between the Upper Bound (UB) and the Lower Bound (LB) computed for that item. +//! Specifically, `(UB - LB) ≤ W * epsilon`, where `W` denotes the sum of all item counts, and +//! `epsilon = 3.5/M`, where `M` is the `max_map_size`. +//! +//! This is the worst case guarantee that applies to arbitrary inputs. [^1] +//! For inputs typically seen in practice (`UB - LB`) is usually much smaller. +//! +//! [^1]: For speed we do employ some randomization that introduces a small probability that our +//! proof of the worst-case bound might not apply to a given run. However, we have ensured that this +//! probability is extremely small. For example, if the stream causes one table purge (rebuild), +//! our proof of the worst case bound applies with probability at least `1 - 1E-14`. If the stream +//! causes `1E9` purges, our proof applies with probability at least `1 - 1E-5`. +//! +//! # Background +//! +//! This code implements a variant of what is commonly known as the "Misra-Gries algorithm". +//! Variants of it were discovered and rediscovered and redesigned several times over the years: +//! - "Finding repeated elements", Misra, Gries, 1982 +//! - "Frequency estimation of Internet packet streams with limited space" Demaine, Lopez-Ortiz, +//! Munro, 2002 +//! - "A simple algorithm for finding frequent elements in streams and bags" Karp, Shenker, +//! Papadimitriou, 2003 +//! - "Efficient Computation of Frequent and Top-k Elements in Data Streams" Metwally, Agrawal, +//! Abbadi, 2006 +//! +//! # Examples +//! +//! ``` //! # use datasketches::frequencies::ErrorType; //! # use datasketches::frequencies::FrequentItemsSketch; //! let mut sketch = FrequentItemsSketch::<i64>::new(64); @@ -38,7 +88,7 @@ //! //! # Serialization //! -//! ```rust +//! ``` //! # use datasketches::frequencies::FrequentItemsSketch; //! let mut sketch = FrequentItemsSketch::<i64>::new(64); //! sketch.update_with_count(42, 2); @@ -52,6 +102,7 @@ mod reverse_purge_item_hash_map; mod serialization; mod sketch; +pub use self::serialization::FrequentItemValue; pub use self::sketch::ErrorType; pub use self::sketch::FrequentItemsSketch; pub use self::sketch::Row; diff --git a/datasketches/src/hash/mod.rs b/datasketches/src/hash/mod.rs index 87eaf22..99d2cca 100644 --- a/datasketches/src/hash/mod.rs +++ b/datasketches/src/hash/mod.rs @@ -19,7 +19,6 @@ mod murmurhash; mod xxhash; pub(crate) use self::murmurhash::MurmurHash3X64128; -#[allow(unused_imports)] pub(crate) use self::xxhash::XxHash64; /// The seed 9001 used in the sketch update methods is a prime number that was chosen very early diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs index 484e16a..3b8f173 100644 --- a/datasketches/src/hll/sketch.rs +++ b/datasketches/src/hll/sketch.rs @@ -54,15 +54,15 @@ impl HllSketch { /// /// # Arguments /// - /// * `lg_config_k` - Log2 of the number of buckets (K). Must be in [4, 21]. + /// * `lg_config_k`: Log2 of the number of buckets (K). Must be in `[4, 21]`. /// - lg_k=4: 16 buckets, ~26% relative error /// - lg_k=12: 4096 buckets, ~1.6% relative error (common choice) /// - lg_k=21: 2M buckets, ~0.4% relative error - /// * `hll_type` - Target HLL array type (Hll4, Hll6, or Hll8) + /// * `hll_type`: Target HLL array type (Hll4, Hll6, or Hll8) /// /// # Panics /// - /// If lg_config_k is not in range [4, 21] + /// If lg_config_k is not in range `[4, 21]` /// /// # Examples /// @@ -94,8 +94,8 @@ impl HllSketch { /// /// # Arguments /// - /// * `lg_config_k` - Log2 of the number of buckets (K) - /// * `mode` - The mode to initialize the sketch with + /// * `lg_config_k`: Log2 of the number of buckets (K) + /// * `mode`: The mode to initialize the sketch with pub(super) fn from_mode(lg_config_k: u8, mode: Mode) -> Self { Self { lg_config_k, mode } } diff --git a/datasketches/src/hll/union.rs b/datasketches/src/hll/union.rs index 03fb4ea..54b72e5 100644 --- a/datasketches/src/hll/union.rs +++ b/datasketches/src/hll/union.rs @@ -59,13 +59,13 @@ impl HllUnion { /// /// # Arguments /// - /// * `lg_max_k` - Maximum log2 of the number of buckets. Must be in [4, 21]. This determines + /// * `lg_max_k`: Maximum log2 of the number of buckets. Must be in `[4, 21]`. This determines /// the maximum precision the union can handle. Input sketches with larger lg_k will be /// down-sampled. /// /// # Panics /// - /// Panics if `lg_max_k` is not in the range [4, 21]. + /// Panics if `lg_max_k` is not in the range `[4, 21]`. /// /// # Examples /// @@ -244,7 +244,7 @@ impl HllUnion { /// /// # Arguments /// - /// * `hll_type` - The target HLL type for the result sketch (Hll4, Hll6, or Hll8) + /// * `hll_type`: The target HLL type for the result sketch (Hll4, Hll6, or Hll8) /// /// # Examples /// --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
