This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 9ade42d docs: add rustdoc examples across sketches (#58)
9ade42d is described below
commit 9ade42d639afd013d19c3454a14773b6d54edc25
Author: Chojan Shang <[email protected]>
AuthorDate: Mon Jan 5 07:11:32 2026 +0800
docs: add rustdoc examples across sketches (#58)
* docs: add rustdoc examples across sketches
Signed-off-by: Chojan Shang <[email protected]>
* docs: make docs clean and simple
Signed-off-by: Chojan Shang <[email protected]>
* docs: align examples with Rust conventions
Signed-off-by: Chojan Shang <[email protected]>
---------
Signed-off-by: Chojan Shang <[email protected]>
---
datasketches/src/countmin/mod.rs | 19 ++++
datasketches/src/countmin/sketch.rs | 90 ++++++++++++++++
datasketches/src/error.rs | 10 ++
datasketches/src/frequencies/mod.rs | 24 +++++
datasketches/src/frequencies/sketch.rs | 119 +++++++++++++++++++++
datasketches/src/hll/mod.rs | 31 ++++++
datasketches/src/hll/sketch.rs | 53 ++++++++++
datasketches/src/hll/union.rs | 49 +++++++++
datasketches/src/resize.rs | 9 ++
datasketches/src/tdigest/mod.rs | 12 +++
datasketches/src/tdigest/sketch.rs | 182 +++++++++++++++++++++++++++++++++
datasketches/src/theta/mod.rs | 9 ++
datasketches/src/theta/sketch.rs | 84 +++++++++++++++
13 files changed, 691 insertions(+)
diff --git a/datasketches/src/countmin/mod.rs b/datasketches/src/countmin/mod.rs
index 2be9282..9b427e9 100644
--- a/datasketches/src/countmin/mod.rs
+++ b/datasketches/src/countmin/mod.rs
@@ -19,6 +19,25 @@
//!
//! The Count-Min sketch provides approximate frequency counts for streaming data
//! with configurable relative error and confidence bounds.
+//!
+//! # Usage
+//!
+//! ```rust
+//! # use datasketches::countmin::CountMinSketch;
+//! let mut sketch = CountMinSketch::new(5, 256);
+//! sketch.update("apple");
+//! sketch.update_with_weight("banana", 3);
+//! assert!(sketch.estimate("banana") >= 3);
+//! ```
+//!
+//! # Configuration Helpers
+//!
+//! ```rust
+//! # use datasketches::countmin::CountMinSketch;
+//! let buckets = CountMinSketch::suggest_num_buckets(0.01);
+//! let hashes = CountMinSketch::suggest_num_hashes(0.99);
+//! let _sketch = CountMinSketch::new(hashes, buckets);
+//! ```
mod serialization;
diff --git a/datasketches/src/countmin/sketch.rs b/datasketches/src/countmin/sketch.rs
index ca08bff..4f8225b 100644
--- a/datasketches/src/countmin/sketch.rs
+++ b/datasketches/src/countmin/sketch.rs
@@ -54,6 +54,14 @@ impl CountMinSketch {
///
/// Panics if `num_hashes` is 0, `num_buckets` is less than 3, or the
/// total table size exceeds the supported limit.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// let sketch = CountMinSketch::new(4, 128);
+ /// assert_eq!(sketch.num_buckets(), 128);
+ /// ```
pub fn new(num_hashes: u8, num_buckets: u32) -> Self {
Self::with_seed(num_hashes, num_buckets, DEFAULT_UPDATE_SEED)
}
@@ -64,6 +72,14 @@ impl CountMinSketch {
///
/// Panics if `num_hashes` is 0, `num_buckets` is less than 3, or the
/// total table size exceeds the supported limit.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// let sketch = CountMinSketch::with_seed(4, 64, 42);
+ /// assert_eq!(sketch.seed(), 42);
+ /// ```
pub fn with_seed(num_hashes: u8, num_buckets: u32, seed: u64) -> Self {
let entries = entries_for_config(num_hashes, num_buckets);
Self::make(num_hashes, num_buckets, seed, entries)
@@ -127,11 +143,29 @@ impl CountMinSketch {
}
/// Updates the sketch with a single occurrence of the item.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// let mut sketch = CountMinSketch::new(4, 128);
+ /// sketch.update("apple");
+ /// assert!(sketch.estimate("apple") >= 1);
+ /// ```
pub fn update<T: Hash>(&mut self, item: T) {
self.update_with_weight(item, 1);
}
/// Updates the sketch with the given item and weight.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// let mut sketch = CountMinSketch::new(4, 128);
+ /// sketch.update_with_weight("banana", 3);
+ /// assert!(sketch.estimate("banana") >= 3);
+ /// ```
pub fn update_with_weight<T: Hash>(&mut self, item: T, weight: i64) {
if weight == 0 {
return;
@@ -147,6 +181,15 @@ impl CountMinSketch {
}
/// Returns the estimated frequency of the given item.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// let mut sketch = CountMinSketch::new(4, 128);
+ /// sketch.update_with_weight("pear", 2);
+ /// assert!(sketch.estimate("pear") >= 2);
+ /// ```
pub fn estimate<T: Hash>(&self, item: T) -> i64 {
let num_buckets = self.num_buckets as usize;
let mut min = i64::MAX;
@@ -178,6 +221,20 @@ impl CountMinSketch {
/// # Panics
///
/// Panics if the sketches have incompatible configurations.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// let mut left = CountMinSketch::new(4, 128);
+ /// let mut right = CountMinSketch::new(4, 128);
+ ///
+ /// left.update("apple");
+ /// right.update_with_weight("banana", 2);
+ ///
+ /// left.merge(&right);
+ /// assert!(left.estimate("banana") >= 2);
+ /// ```
pub fn merge(&mut self, other: &CountMinSketch) {
if std::ptr::eq(self, other) {
panic!("Cannot merge a sketch with itself.");
@@ -195,6 +252,17 @@ impl CountMinSketch {
}
/// Serializes this sketch into the DataSketches Count-Min format.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// # let mut sketch = CountMinSketch::new(4, 128);
+ /// # sketch.update("apple");
+ /// let bytes = sketch.serialize();
+ /// let decoded = CountMinSketch::deserialize(&bytes).unwrap();
+ /// assert!(decoded.estimate("apple") >= 1);
+ /// ```
pub fn serialize(&self) -> Vec<u8> {
let header_size = PREAMBLE_LONGS_SHORT as usize * LONG_SIZE_BYTES;
let payload_size = if self.is_empty() {
@@ -227,11 +295,33 @@ impl CountMinSketch {
}
/// Deserializes a sketch from bytes using the default seed.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// # let mut sketch = CountMinSketch::new(4, 64);
+ /// # sketch.update("apple");
+ /// # let bytes = sketch.serialize();
+ /// let decoded = CountMinSketch::deserialize(&bytes).unwrap();
+ /// assert!(decoded.estimate("apple") >= 1);
+ /// ```
pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
Self::deserialize_with_seed(bytes, DEFAULT_UPDATE_SEED)
}
/// Deserializes a sketch from bytes using the provided seed.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// # let mut sketch = CountMinSketch::with_seed(4, 64, 7);
+ /// # sketch.update("apple");
+ /// # let bytes = sketch.serialize();
+ /// let decoded = CountMinSketch::deserialize_with_seed(&bytes, 7).unwrap();
+ /// assert!(decoded.estimate("apple") >= 1);
+ /// ```
pub fn deserialize_with_seed(bytes: &[u8], seed: u64) -> Result<Self, Error> {
fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> Error {
move |_| Error::insufficient_data(tag)
diff --git a/datasketches/src/error.rs b/datasketches/src/error.rs
index 624ee0a..e756f2b 100644
--- a/datasketches/src/error.rs
+++ b/datasketches/src/error.rs
@@ -46,6 +46,16 @@ impl fmt::Display for ErrorKind {
}
/// Error is the error struct returned by all datasketches functions.
+///
+/// # Examples
+///
+/// ```
+/// # use datasketches::error::Error;
+/// # use datasketches::error::ErrorKind;
+/// let err = Error::new(ErrorKind::InvalidArgument, "bad input");
+/// assert_eq!(err.kind(), ErrorKind::InvalidArgument);
+/// assert_eq!(err.message(), "bad input");
+/// ```
pub struct Error {
kind: ErrorKind,
message: String,
diff --git a/datasketches/src/frequencies/mod.rs b/datasketches/src/frequencies/mod.rs
index e461b61..93fb5e4 100644
--- a/datasketches/src/frequencies/mod.rs
+++ b/datasketches/src/frequencies/mod.rs
@@ -23,6 +23,30 @@
//!
//! For background, see the Java documentation:
//! <https://apache.github.io/datasketches-java/9.0.0/org/apache/datasketches/frequencies/FrequentItemsSketch.html>
+//!
+//! # Usage
+//!
+//! ```rust
+//! # use datasketches::frequencies::ErrorType;
+//! # use datasketches::frequencies::FrequentItemsSketch;
+//! let mut sketch = FrequentItemsSketch::<i64>::new(64);
+//! sketch.update_with_count(1, 3);
+//! sketch.update(2);
+//! let rows = sketch.frequent_items(ErrorType::NoFalseNegatives);
+//! assert!(rows.iter().any(|row| *row.item() == 1));
+//! ```
+//!
+//! # Serialization
+//!
+//! ```rust
+//! # use datasketches::frequencies::FrequentItemsSketch;
+//! let mut sketch = FrequentItemsSketch::<i64>::new(64);
+//! sketch.update_with_count(42, 2);
+//!
+//! let bytes = sketch.serialize();
+//! let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
+//! assert!(decoded.estimate(&42) >= 2);
+//! ```
mod reverse_purge_item_hash_map;
mod serialization;
diff --git a/datasketches/src/frequencies/sketch.rs b/datasketches/src/frequencies/sketch.rs
index 28f3325..f399445 100644
--- a/datasketches/src/frequencies/sketch.rs
+++ b/datasketches/src/frequencies/sketch.rs
@@ -104,6 +104,16 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
/// # Panics
///
/// Panics if `max_map_size` is not a power of two.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// let mut sketch = FrequentItemsSketch::<i64>::new(64);
+ /// sketch.update(1);
+ /// sketch.update(2);
+ /// assert_eq!(sketch.num_active_items(), 2);
+ /// ```
pub fn new(max_map_size: usize) -> Self {
let lg_max_map_size = exact_log2(max_map_size);
Self::with_lg_map_sizes(lg_max_map_size, LG_MIN_MAP_SIZE)
@@ -129,6 +139,15 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
/// Returns the estimated frequency for an item.
///
/// If the item is tracked, this is `item_count + offset`. Otherwise it is zero.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// let mut sketch = FrequentItemsSketch::<i64>::new(64);
+ /// sketch.update_with_count(10, 2);
+ /// assert!(sketch.estimate(&10) >= 2);
+ /// ```
pub fn estimate(&self, item: &T) -> u64 {
let value = self.hash_map.get(item);
if value > 0 { value + self.offset } else { 0 }
@@ -199,6 +218,15 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
}
/// Updates the sketch with a count of one.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// let mut sketch = FrequentItemsSketch::<i64>::new(64);
+ /// sketch.update(42);
+ /// assert!(sketch.estimate(&42) >= 1);
+ /// ```
pub fn update(&mut self, item: T) {
self.update_with_count(item, 1);
}
@@ -206,6 +234,15 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
/// Updates the sketch with an item and count.
///
/// A count of zero is a no-op.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// let mut sketch = FrequentItemsSketch::<i64>::new(64);
+ /// sketch.update_with_count(10, 3);
+ /// assert!(sketch.estimate(&10) >= 3);
+ /// ```
pub fn update_with_count(&mut self, item: T, count: u64) {
if count == 0 {
return;
@@ -220,6 +257,18 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
///
/// The other sketch may have a different map size. The merged sketch respects the
/// larger error tolerance of the inputs.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// let mut left = FrequentItemsSketch::<i64>::new(64);
+ /// let mut right = FrequentItemsSketch::<i64>::new(64);
+ /// left.update(1);
+ /// right.update_with_count(2, 2);
+ /// left.merge(&right);
+ /// assert!(left.estimate(&2) >= 2);
+ /// ```
pub fn merge(&mut self, other: &Self)
where
T: Clone,
@@ -243,6 +292,18 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
/// Returns frequent items using the sketch maximum error as threshold.
///
/// This is equivalent to `frequent_items_with_threshold(self.maximum_error(), error_type)`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::ErrorType;
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// let mut sketch = FrequentItemsSketch::<i64>::new(64);
+ /// sketch.update_with_count(1, 5);
+ /// sketch.update(2);
+ /// let rows = sketch.frequent_items(ErrorType::NoFalseNegatives);
+ /// assert!(rows.iter().any(|row| *row.item() == 1));
+ /// ```
pub fn frequent_items(&self, error_type: ErrorType) -> Vec<Row<T>>
where
T: Clone,
@@ -256,6 +317,18 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
///
/// For [`ErrorType::NoFalseNegatives`], items are included when `upper_bound > threshold`.
/// For [`ErrorType::NoFalsePositives`], items are included when `lower_bound > threshold`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::ErrorType;
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// let mut sketch = FrequentItemsSketch::<i64>::new(64);
+ /// sketch.update_with_count(1, 5);
+ /// sketch.update(2);
+ /// let rows = sketch.frequent_items_with_threshold(ErrorType::NoFalsePositives, 3);
+ /// assert!(rows.iter().any(|row| *row.item() == 1));
+ /// ```
pub fn frequent_items_with_threshold(
&self,
error_type: ErrorType,
@@ -459,11 +532,33 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
impl FrequentItemsSketch<i64> {
/// Serializes this sketch into a byte vector.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// # let mut sketch = FrequentItemsSketch::<i64>::new(64);
+ /// # sketch.update_with_count(7, 2);
+ /// let bytes = sketch.serialize();
+ /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
+ /// assert!(decoded.estimate(&7) >= 2);
+ /// ```
pub fn serialize(&self) -> Vec<u8> {
self.serialize_inner(count_i64_items_bytes, serialize_i64_items)
}
/// Deserializes a sketch from bytes.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// # let mut sketch = FrequentItemsSketch::<i64>::new(64);
+ /// # sketch.update_with_count(7, 2);
+ /// # let bytes = sketch.serialize();
+ /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
+ /// assert!(decoded.estimate(&7) >= 2);
+ /// ```
pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
Self::deserialize_inner(bytes, deserialize_i64_items)
}
@@ -471,11 +566,35 @@ impl FrequentItemsSketch<i64> {
impl FrequentItemsSketch<String> {
/// Serializes this sketch into a byte vector.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// # let mut sketch = FrequentItemsSketch::<String>::new(64);
+ /// # let apple = "apple".to_string();
+ /// # sketch.update_with_count(apple.clone(), 2);
+ /// let bytes = sketch.serialize();
+ /// let decoded = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
+ /// assert!(decoded.estimate(&apple) >= 2);
+ /// ```
pub fn serialize(&self) -> Vec<u8> {
self.serialize_inner(count_string_items_bytes, serialize_string_items)
}
/// Deserializes a sketch from bytes.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::frequencies::FrequentItemsSketch;
+ /// # let mut sketch = FrequentItemsSketch::<String>::new(64);
+ /// # let apple = "apple".to_string();
+ /// # sketch.update_with_count(apple.clone(), 2);
+ /// # let bytes = sketch.serialize();
+ /// let decoded = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
+ /// assert!(decoded.estimate(&apple) >= 2);
+ /// ```
pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
Self::deserialize_inner(bytes, deserialize_string_items)
}
diff --git a/datasketches/src/hll/mod.rs b/datasketches/src/hll/mod.rs
index 9bf2e7d..da61ccc 100644
--- a/datasketches/src/hll/mod.rs
+++ b/datasketches/src/hll/mod.rs
@@ -71,6 +71,37 @@
//!
//! The serialization format is compatible with Apache DataSketches implementations
//! in Java and C++, enabling cross-platform sketch exchange.
+//!
+//! # Usage
+//!
+//! ```rust
+//! # use datasketches::hll::HllSketch;
+//! # use datasketches::hll::HllType;
+//! # use datasketches::hll::NumStdDev;
+//! let mut sketch = HllSketch::new(12, HllType::Hll8);
+//! sketch.update("apple");
+//! let upper = sketch.upper_bound(NumStdDev::Two);
+//! assert!(upper >= sketch.estimate());
+//! ```
+//!
+//! # Union
+//!
+//! ```rust
+//! # use datasketches::hll::HllSketch;
+//! # use datasketches::hll::HllType;
+//! # use datasketches::hll::HllUnion;
+//! let mut left = HllSketch::new(10, HllType::Hll8);
+//! let mut right = HllSketch::new(10, HllType::Hll8);
+//! left.update("apple");
+//! right.update("banana");
+//!
+//! let mut union = HllUnion::new(10);
+//! union.update(&left);
+//! union.update(&right);
+//!
+//! let result = union.get_result(HllType::Hll8);
+//! assert!(result.estimate() >= 2.0);
+//! ```
use std::hash::Hash;
diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs
index 64626cd..7fb6c79 100644
--- a/datasketches/src/hll/sketch.rs
+++ b/datasketches/src/hll/sketch.rs
@@ -61,6 +61,15 @@ impl HllSketch {
/// # Panics
///
/// If lg_config_k is not in range [4, 21]
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::hll::HllSketch;
+ /// # use datasketches::hll::HllType;
+ /// let sketch = HllSketch::new(12, HllType::Hll8);
+ /// assert_eq!(sketch.lg_config_k(), 12);
+ /// ```
pub fn new(lg_config_k: u8, hll_type: HllType) -> Self {
assert!(
(4..=21).contains(&lg_config_k),
@@ -134,6 +143,16 @@ impl HllSketch {
///
/// This accepts any type that implements `Hash`. The value is hashed
/// and converted to a coupon, which is then inserted into the sketch.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::hll::HllSketch;
+ /// # use datasketches::hll::HllType;
+ /// let mut sketch = HllSketch::new(10, HllType::Hll8);
+ /// sketch.update("apple");
+ /// assert!(sketch.estimate() >= 1.0);
+ /// ```
pub fn update<T: Hash>(&mut self, value: T) {
let coupon = coupon(value);
self.update_with_coupon(coupon);
@@ -174,6 +193,16 @@ impl HllSketch {
}
/// Get the current cardinality estimate
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::hll::HllSketch;
+ /// # use datasketches::hll::HllType;
+ /// let mut sketch = HllSketch::new(10, HllType::Hll8);
+ /// sketch.update("apple");
+ /// assert!(sketch.estimate() >= 1.0);
+ /// ```
pub fn estimate(&self) -> f64 {
match &self.mode {
Mode::List { list, .. } => list.container().estimate(),
@@ -213,6 +242,18 @@ impl HllSketch {
}
/// Deserializes an HLL sketch from bytes
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::hll::HllSketch;
+ /// # use datasketches::hll::HllType;
+ /// # let mut sketch = HllSketch::new(10, HllType::Hll8);
+ /// # sketch.update("apple");
+ /// # let bytes = sketch.serialize();
+ /// let decoded = HllSketch::deserialize(&bytes).unwrap();
+ /// assert!(decoded.estimate() >= 1.0);
+ /// ```
pub fn deserialize(bytes: &[u8]) -> Result<HllSketch, Error> {
fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> Error {
move |_| Error::insufficient_data(tag)
@@ -323,6 +364,18 @@ impl HllSketch {
}
/// Serializes the HLL sketch to bytes
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::hll::HllSketch;
+ /// # use datasketches::hll::HllType;
+ /// # let mut sketch = HllSketch::new(10, HllType::Hll8);
+ /// # sketch.update("apple");
+ /// let bytes = sketch.serialize();
+ /// let decoded = HllSketch::deserialize(&bytes).unwrap();
+ /// assert!(decoded.estimate() >= 1.0);
+ /// ```
pub fn serialize(&self) -> Vec<u8> {
match &self.mode {
Mode::List { list, hll_type } => list.serialize(self.lg_config_k, *hll_type),
diff --git a/datasketches/src/hll/union.rs b/datasketches/src/hll/union.rs
index 1d6c215..7946972 100644
--- a/datasketches/src/hll/union.rs
+++ b/datasketches/src/hll/union.rs
@@ -66,6 +66,16 @@ impl HllUnion {
/// # Panics
///
/// Panics if `lg_max_k` is not in the range [4, 21].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::hll::HllType;
+ /// # use datasketches::hll::HllUnion;
+ /// let mut union = HllUnion::new(10);
+ /// union.update_value("apple");
+ /// let _result = union.get_result(HllType::Hll8);
+ /// ```
pub fn new(lg_max_k: u8) -> Self {
assert!(
(4..=21).contains(&lg_max_k),
@@ -83,6 +93,16 @@ impl HllUnion {
///
/// This accepts any type that implements `Hash`. The value is hashed
/// and converted to a coupon, which is then inserted into the sketch.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::hll::HllType;
+ /// # use datasketches::hll::HllUnion;
+ /// let mut union = HllUnion::new(10);
+ /// union.update_value("apple");
+ /// let _result = union.get_result(HllType::Hll8);
+ /// ```
pub fn update_value<T: Hash>(&mut self, value: T) {
self.gadget.update(value);
}
@@ -93,6 +113,24 @@ impl HllUnion {
/// - Sketches with different lg_k values (resizes/downsamples as needed)
/// - Sketches in different modes (List, Set, Array4/6/8)
/// - Sketches with different target HLL types
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::hll::HllSketch;
+ /// # use datasketches::hll::HllType;
+ /// # use datasketches::hll::HllUnion;
+ /// let mut left = HllSketch::new(10, HllType::Hll8);
+ /// let mut right = HllSketch::new(10, HllType::Hll8);
+ /// left.update("apple");
+ /// right.update("banana");
+ ///
+ /// let mut union = HllUnion::new(10);
+ /// union.update(&left);
+ /// union.update(&right);
+ /// let result = union.get_result(HllType::Hll8);
+ /// assert!(result.estimate() >= 2.0);
+ /// ```
pub fn update(&mut self, sketch: &HllSketch) {
if sketch.is_empty() {
return;
@@ -207,6 +245,17 @@ impl HllUnion {
/// # Arguments
///
/// * `hll_type` - The target HLL type for the result sketch (Hll4, Hll6, or Hll8)
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::hll::HllType;
+ /// # use datasketches::hll::HllUnion;
+ /// let mut union = HllUnion::new(10);
+ /// union.update_value("apple");
+ /// let result = union.get_result(HllType::Hll6);
+ /// assert!(result.estimate() >= 1.0);
+ /// ```
pub fn get_result(&self, hll_type: HllType) -> HllSketch {
let gadget_type = self.gadget.target_type();
diff --git a/datasketches/src/resize.rs b/datasketches/src/resize.rs
index caf87ab..1255bd7 100644
--- a/datasketches/src/resize.rs
+++ b/datasketches/src/resize.rs
@@ -32,6 +32,15 @@
/// reached.
///
/// Similarly, "X4" is a factor of 4 and "X8" is a factor of 8.
+///
+/// # Examples
+///
+/// ```
+/// # use datasketches::ResizeFactor;
+/// let factor = ResizeFactor::X4;
+/// assert_eq!(factor.value(), 4);
+/// assert_eq!(factor.lg_value(), 2);
+/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResizeFactor {
/// Do not resize. Sketch will be configured to full size.
diff --git a/datasketches/src/tdigest/mod.rs b/datasketches/src/tdigest/mod.rs
index ad9ca42..d1a80c5 100644
--- a/datasketches/src/tdigest/mod.rs
+++ b/datasketches/src/tdigest/mod.rs
@@ -47,6 +47,18 @@
//! [Datasketches page on t-digest](https://datasketches.apache.org/docs/tdigest/tdigest.html).
//!
//! [paper]: https://arxiv.org/abs/1902.04023
+//!
+//! # Usage
+//!
+//! ```rust
+//! # use datasketches::tdigest::TDigestMut;
+//! let mut sketch = TDigestMut::new(100);
+//! sketch.update(1.0);
+//! sketch.update(2.0);
+//! let median = sketch.quantile(0.5).unwrap();
+//! let frozen = sketch.freeze();
+//! assert!(frozen.rank(2.0).is_some());
+//! ```
mod serialization;
diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs
index ddf440f..037953d 100644
--- a/datasketches/src/tdigest/sketch.rs
+++ b/datasketches/src/tdigest/sketch.rs
@@ -63,6 +63,14 @@ impl TDigestMut {
/// # Panics
///
/// Panics if k is less than 10
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// let sketch = TDigestMut::new(100);
+ /// assert_eq!(sketch.k(), 100);
+ /// ```
pub fn new(k: u16) -> Self {
Self::make(
k,
@@ -82,6 +90,14 @@ impl TDigestMut {
/// # Errors
///
/// If k is less than 10, returns [`ErrorKind::InvalidArgument`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// let sketch = TDigestMut::try_new(20).unwrap();
+ /// assert_eq!(sketch.k(), 20);
+ /// ```
pub fn try_new(k: u16) -> Result<Self, Error> {
if k < 10 {
return Err(Error::new(
@@ -134,6 +150,15 @@ impl TDigestMut {
/// Update this TDigest with the given value.
///
/// [f64::NAN], [f64::INFINITY], and [f64::NEG_INFINITY] values are ignored.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// let mut sketch = TDigestMut::new(100);
+ /// sketch.update(1.0);
+ /// assert!(sketch.total_weight() >= 1);
+ /// ```
pub fn update(&mut self, value: f64) {
if value.is_nan() || value.is_infinite() {
return;
@@ -182,6 +207,18 @@ impl TDigestMut {
}
/// Merge the given TDigest into this one
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// let mut left = TDigestMut::new(100);
+ /// let mut right = TDigestMut::new(100);
+ /// left.update(1.0);
+ /// right.update(2.0);
+ /// left.merge(&right);
+ /// assert_eq!(left.total_weight(), 2);
+ /// ```
pub fn merge(&mut self, other: &TDigestMut) {
if other.is_empty() {
return;
@@ -209,6 +246,16 @@ impl TDigestMut {
}
/// Freezes this TDigest into an immutable one.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// let mut sketch = TDigestMut::new(100);
+ /// sketch.update(1.0);
+ /// let frozen = sketch.freeze();
+ /// assert!(!frozen.is_empty());
+ /// ```
pub fn freeze(mut self) -> TDigest {
self.compress();
TDigest {
@@ -232,6 +279,18 @@ impl TDigestMut {
}
/// See [`TDigest::cdf`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # for value in [1.0, 2.0, 3.0] {
+ /// # sketch.update(value);
+ /// # }
+ /// let cdf = sketch.cdf(&[1.5]).unwrap();
+ /// assert_eq!(cdf.len(), 2);
+ /// ```
pub fn cdf(&mut self, split_points: &[f64]) -> Option<Vec<f64>> {
check_split_points(split_points);
@@ -243,6 +302,18 @@ impl TDigestMut {
}
/// See [`TDigest::pmf`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # for value in [1.0, 2.0, 3.0] {
+ /// # sketch.update(value);
+ /// # }
+ /// let pmf = sketch.pmf(&[1.5]).unwrap();
+ /// assert_eq!(pmf.len(), 2);
+ /// ```
pub fn pmf(&mut self, split_points: &[f64]) -> Option<Vec<f64>> {
check_split_points(split_points);
@@ -254,6 +325,18 @@ impl TDigestMut {
}
/// See [`TDigest::rank`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # for value in [1.0, 2.0, 3.0] {
+ /// # sketch.update(value);
+ /// # }
+ /// let rank = sketch.rank(2.0).unwrap();
+ /// assert!((0.0..=1.0).contains(&rank));
+ /// ```
pub fn rank(&mut self, value: f64) -> Option<f64> {
assert!(!value.is_nan(), "value must not be NaN");
@@ -275,6 +358,18 @@ impl TDigestMut {
}
/// See [`TDigest::quantile`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # for value in [1.0, 2.0, 3.0] {
+ /// # sketch.update(value);
+ /// # }
+ /// let median = sketch.quantile(0.5).unwrap();
+ /// assert!((1.0..=3.0).contains(&median));
+ /// ```
pub fn quantile(&mut self, rank: f64) -> Option<f64> {
assert!((0.0..=1.0).contains(&rank), "rank must be in [0.0, 1.0]");
@@ -286,6 +381,17 @@ impl TDigestMut {
}
/// Serializes this TDigest to bytes.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # sketch.update(1.0);
+ /// let bytes = sketch.serialize();
+ /// let decoded = TDigestMut::deserialize(&bytes, false).unwrap();
+ /// assert_eq!(decoded.max_value(), Some(1.0));
+ /// ```
pub fn serialize(&mut self) -> Vec<u8> {
self.compress();
@@ -367,6 +473,18 @@ impl TDigestMut {
///
/// [^1]: This is to support reading the `tdigest<float>` format from the C++ implementation.
/// [^2]: <https://github.com/tdunning/t-digest>
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # sketch.update(1.0);
+ /// # sketch.update(2.0);
+ /// # let bytes = sketch.serialize();
+ /// let decoded = TDigestMut::deserialize(&bytes, false).unwrap();
+ /// assert_eq!(decoded.max_value(), Some(2.0));
+ /// ```
pub fn deserialize(bytes: &[u8], is_f32: bool) -> Result<Self, Error> {
fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> Error {
move |_| Error::insufficient_data(tag)
@@ -747,6 +865,19 @@ impl TDigest {
///
/// Panics if `split_points` is not unique, not monotonically increasing, or contains `NaN`
/// values.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # for value in [1.0, 2.0, 3.0] {
+ /// # sketch.update(value);
+ /// # }
+ /// let digest = sketch.freeze();
+ /// let cdf = digest.cdf(&[1.5]).unwrap();
+ /// assert_eq!(cdf.len(), 2);
+ /// ```
pub fn cdf(&self, split_points: &[f64]) -> Option<Vec<f64>> {
self.view().cdf(split_points)
}
@@ -770,6 +901,19 @@ impl TDigest {
///
/// Panics if `split_points` is not unique, not monotonically increasing, or contains `NaN`
/// values.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # for value in [1.0, 2.0, 3.0] {
+ /// # sketch.update(value);
+ /// # }
+ /// let digest = sketch.freeze();
+ /// let pmf = digest.pmf(&[1.5]).unwrap();
+ /// assert_eq!(pmf.len(), 2);
+ /// ```
pub fn pmf(&self, split_points: &[f64]) -> Option<Vec<f64>> {
self.view().pmf(split_points)
}
@@ -781,6 +925,19 @@ impl TDigest {
/// # Panics
///
/// Panics if the value is `NaN`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # for value in [1.0, 2.0, 3.0] {
+ /// # sketch.update(value);
+ /// # }
+ /// let digest = sketch.freeze();
+ /// let rank = digest.rank(2.0).unwrap();
+ /// assert!((0.0..=1.0).contains(&rank));
+ /// ```
pub fn rank(&self, value: f64) -> Option<f64> {
assert!(!value.is_nan(), "value must not be NaN");
self.view().rank(value)
@@ -793,12 +950,37 @@ impl TDigest {
/// # Panics
///
/// Panics if rank is not in [0.0, 1.0].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # for value in [1.0, 2.0, 3.0] {
+ /// # sketch.update(value);
+ /// # }
+ /// let digest = sketch.freeze();
+ /// let q = digest.quantile(0.5).unwrap();
+ /// assert!((1.0..=3.0).contains(&q));
+ /// ```
pub fn quantile(&self, rank: f64) -> Option<f64> {
assert!((0.0..=1.0).contains(&rank), "rank must be in [0.0, 1.0]");
self.view().quantile(rank)
}
/// Converts this immutable TDigest into a mutable one.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::tdigest::TDigestMut;
+ /// # let mut sketch = TDigestMut::new(100);
+ /// # sketch.update(1.0);
+ /// # let digest = sketch.freeze();
+ /// let mut mutable = digest.unfreeze();
+ /// mutable.update(2.0);
+ /// assert_eq!(mutable.total_weight(), 2);
+ /// ```
pub fn unfreeze(self) -> TDigestMut {
TDigestMut::make(
self.k,
diff --git a/datasketches/src/theta/mod.rs b/datasketches/src/theta/mod.rs
index 0d50348..ccaac52 100644
--- a/datasketches/src/theta/mod.rs
+++ b/datasketches/src/theta/mod.rs
@@ -28,6 +28,15 @@
//! configurable accuracy and memory usage. The implementation supports:
//!
//! - **ThetaSketch**: Mutable sketch for building from input data
+//!
+//! # Usage
+//!
+//! ```rust
+//! # use datasketches::theta::ThetaSketch;
+//! let mut sketch = ThetaSketch::builder().build();
+//! sketch.update("apple");
+//! assert!(sketch.estimate() >= 1.0);
+//! ```
mod hash_table;
mod sketch;
diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs
index 0ad5357..0baa0f7 100644
--- a/datasketches/src/theta/sketch.rs
+++ b/datasketches/src/theta/sketch.rs
@@ -38,11 +38,28 @@ pub struct ThetaSketch {
impl ThetaSketch {
/// Create a new builder for ThetaSketch
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// let sketch = ThetaSketch::builder().lg_k(12).build();
+ /// assert_eq!(sketch.lg_k(), 12);
+ /// ```
pub fn builder() -> ThetaSketchBuilder {
ThetaSketchBuilder::default()
}
/// Update the sketch with a hashable value
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// let mut sketch = ThetaSketch::builder().build();
+ /// sketch.update("apple");
+ /// assert!(sketch.estimate() >= 1.0);
+ /// ```
pub fn update<T: Hash>(&mut self, value: T) {
let hash = self.table.hash_and_screen(value);
if hash != 0 {
@@ -51,6 +68,15 @@ impl ThetaSketch {
}
/// Update the sketch with a f64 value
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// let mut sketch = ThetaSketch::builder().build();
+ /// sketch.update_f64(1.0);
+ /// assert!(sketch.estimate() >= 1.0);
+ /// ```
pub fn update_f64(&mut self, value: f64) {
// Canonicalize double for compatibility with Java
let canonical = canonical_double(value);
@@ -58,11 +84,29 @@ impl ThetaSketch {
}
/// Update the sketch with a f32 value
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// let mut sketch = ThetaSketch::builder().build();
+ /// sketch.update_f32(1.0);
+ /// assert!(sketch.estimate() >= 1.0);
+ /// ```
pub fn update_f32(&mut self, value: f32) {
self.update_f64(value as f64);
}
/// Return cardinality estimate
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// # let mut sketch = ThetaSketch::builder().build();
+ /// # sketch.update("apple");
+ /// assert!(sketch.estimate() >= 1.0);
+ /// ```
pub fn estimate(&self) -> f64 {
if self.is_empty() {
return 0.0;
@@ -113,6 +157,16 @@ impl ThetaSketch {
}
/// Return iterator over hash values
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// # let mut sketch = ThetaSketch::builder().build();
+ /// # sketch.update("apple");
+ /// let mut iter = sketch.iter();
+ /// assert!(iter.next().is_some());
+ /// ```
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
self.table.iter()
}
@@ -144,6 +198,14 @@ impl ThetaSketchBuilder {
/// # Panics
///
/// If lg_k is not in range [5, 26]
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// let sketch = ThetaSketch::builder().lg_k(12).build();
+ /// assert_eq!(sketch.lg_k(), 12);
+ /// ```
pub fn lg_k(mut self, lg_k: u8) -> Self {
assert!(
(MIN_LG_K..=MAX_LG_K).contains(&lg_k),
@@ -167,6 +229,13 @@ impl ThetaSketchBuilder {
/// # Panics
///
/// If p is not in range [0.0, 1.0]
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// let _sketch = ThetaSketch::builder().sampling_probability(0.5).build();
+ /// ```
pub fn sampling_probability(mut self, probability: f32) -> Self {
assert!(
(0.0..=1.0).contains(&probability),
@@ -177,12 +246,27 @@ impl ThetaSketchBuilder {
}
/// Set hash seed.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// let _sketch = ThetaSketch::builder().seed(7).build();
+ /// ```
pub fn seed(mut self, seed: u64) -> Self {
self.seed = seed;
self
}
/// Build the ThetaSketch.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::theta::ThetaSketch;
+ /// let sketch = ThetaSketch::builder().lg_k(10).build();
+ /// assert_eq!(sketch.lg_k(), 10);
+ /// ```
pub fn build(self) -> ThetaSketch {
let table = ThetaHashTable::new(
self.lg_k,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]