tisonkun commented on code in PR #1:
URL: https://github.com/apache/datasketches-rust/pull/1#discussion_r2616637317


##########
src/hll/sketch.rs:
##########
@@ -0,0 +1,422 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! HyperLogLog sketch implementation
+//!
+//! This module provides the main [`HllSketch`] struct, which is the primary 
interface
+//! for creating and using HLL sketches for cardinality estimation.
+//!
+//! # Adaptive Mode System
+//!
+//! The sketch automatically transitions between three internal modes based on 
cardinality:
+//!
+//! - **List mode**: Stores individual coupons in a compact list for small 
cardinalities.
+//!   Used when fewer than ~32 unique values have been seen.
+//!
+//! - **Set mode**: Uses a hash set with open addressing for medium 
cardinalities.
+//!   Provides better performance than list mode while still being 
space-efficient.
+//!   The set grows dynamically until it reaches K/8 entries.
+//!
+//! - **HLL mode**: Uses the full HLL array (Array4, Array6, or Array8) for 
large cardinalities.
+//!   Provides constant memory usage and accurate estimates for billions of 
unique values.
+//!
+//! Mode transitions are automatic and transparent to the user. Each promotion 
preserves
+//! all previously observed values and maintains estimation accuracy.
+//!
+//! # Serialization
+//!
+//! Sketches can be serialized and deserialized while preserving all state, 
including:
+//! - Current mode and HLL type
+//! - All observed values (coupons or register values)
+//! - HIP accumulator state for accurate estimation
+//! - Out-of-order flag for merged/deserialized sketches
+//!
+//! The serialization format is compatible with Apache DataSketches 
implementations
+//! in Java and C++, enabling cross-platform sketch exchange.
+
+use std::hash::Hash;
+use std::io;
+
+use crate::hll::array4::Array4;
+use crate::hll::array6::Array6;
+use crate::hll::array8::Array8;
+use crate::hll::container::Container;
+use crate::hll::hash_set::HashSet;
+use crate::hll::list::List;
+use crate::hll::serialization::*;
+use crate::hll::{HllType, RESIZE_DENOM, RESIZE_NUMER, coupon};
+
+/// Current sketch mode
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum CurMode {
+    List = 0,
+    Set = 1,
+    Hll = 2,
+}
+
+#[derive(Debug, Clone)]
+pub struct HllSketch {
+    lg_config_k: u8,
+    mode: Mode,
+}
+
+impl PartialEq for HllSketch {
+    fn eq(&self, other: &Self) -> bool {
+        if self.lg_config_k != other.lg_config_k {
+            return false;
+        }
+
+        match (&self.mode, &other.mode) {
+            (
+                Mode::List {
+                    list: l1,
+                    hll_type: t1,
+                },
+                Mode::List {
+                    list: l2,
+                    hll_type: t2,
+                },
+            ) => l1 == l2 && t1 == t2,
+            (
+                Mode::Set {
+                    set: s1,
+                    hll_type: t1,
+                },
+                Mode::Set {
+                    set: s2,
+                    hll_type: t2,
+                },
+            ) => s1 == s2 && t1 == t2,
+            (Mode::Array4(a1), Mode::Array4(a2)) => a1 == a2,
+            (Mode::Array6(a1), Mode::Array6(a2)) => a1 == a2,
+            (Mode::Array8(a1), Mode::Array8(a2)) => a1 == a2,
+            _ => false, // Different modes are not equal
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+enum Mode {
+    List { list: List, hll_type: HllType },
+    Set { set: HashSet, hll_type: HllType },
+    Array4(Array4),
+    Array6(Array6),
+    Array8(Array8),
+}
+
+impl HllSketch {
+    /// Create a new HLL sketch
+    ///
+    /// # Arguments
+    /// * `lg_config_k` - Log2 of the number of buckets (K). Must be in [4, 
21].
+    ///   - lg_k=4: 16 buckets, ~26% relative error
+    ///   - lg_k=12: 4096 buckets, ~1.6% relative error (common choice)
+    ///   - lg_k=21: 2M buckets, ~0.4% relative error
+    /// * `hll_type` - Target HLL array type (Hll4, Hll6, or Hll8)
+    ///
+    /// # Panics
+    /// Panics if lg_config_k is not in [4, 21]
+    pub fn new(lg_config_k: u8, hll_type: HllType) -> Self {
+        assert!(
+            lg_config_k > 4 && lg_config_k < 21,
+            "lg_config_k must be in [4, 21]"
+        );
+
+        let list = List::default();
+
+        Self {
+            lg_config_k,
+            mode: Mode::List { list, hll_type },
+        }
+    }
+
+    pub fn lg_config_k(&self) -> u8 {
+        self.lg_config_k
+    }
+
+    /// Update the sketch with a value
+    ///
+    /// This accepts any type that implements `Hash`. The value is hashed
+    /// and converted to a coupon, which is then inserted into the sketch.
+    /// The sketch will automatically promote from List → Set → HLL as needed.
+    pub fn update<T: Hash>(&mut self, value: T) {

Review Comment:
   "List → Set → HLL" looks like internal impl details. Shall we expose them to 
end users? I'm afraid that users don't know what this sentence means and they 
should not.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to