tisonkun commented on code in PR #1: URL: https://github.com/apache/datasketches-rust/pull/1#discussion_r2616635087
########## src/hll/estimator.rs: ########## @@ -0,0 +1,578 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! HIP (Historical Inverse Probability) Estimator for HyperLogLog +//! +//! The HIP estimator provides improved cardinality estimation by maintaining +//! an accumulator that tracks the historical sequence of register updates. +//! This is more accurate than the standard HLL estimator, especially for +//! moderate cardinalities. + +use crate::hll::{composite_interpolation, cubic_interpolation, harmonic_numbers}; + +/// HIP estimator with KxQ registers for improved cardinality estimation +/// +/// This struct encapsulates all estimation-related state and logic, +/// allowing it to be composed into Array4, Array6, and Array8. +/// +/// The estimator supports two modes: +/// - **In-order mode**: Uses HIP (Historical Inverse Probability) accumulator +/// for accurate sequential updates +/// - **Out-of-order mode**: Uses composite estimator (raw HLL + linear counting) +/// after deserialization or merging +#[derive(Debug, Clone)] +pub struct HipEstimator { + /// HIP estimator accumulator + hip_accum: f64, + /// KxQ register for values < 32 (larger inverse powers) + kxq0: f64, + /// KxQ register for values >= 32 (tiny inverse powers) + kxq1: f64, + /// Out-of-order flag: when true, HIP updates are skipped + out_of_order: bool, +} + +impl PartialEq for HipEstimator { + fn eq(&self, other: &Self) -> bool { + // For serialization round-trip tests, f64 values should be bit-identical + // after going through binary serialization + self.hip_accum == other.hip_accum + && self.kxq0 == other.kxq0 + && self.kxq1 == other.kxq1 + && self.out_of_order == other.out_of_order + } +} + +impl HipEstimator { + /// Create a new HIP estimator for a sketch with 2^lg_config_k registers + pub fn new(lg_config_k: u8) -> Self { + let k = 1 << lg_config_k; + Self { + hip_accum: 0.0, + kxq0: k as f64, // All registers start at 0, so kxq0 = k * (1/2^0) = k + kxq1: 0.0, + out_of_order: false, + } + } + + /// Update the estimator when a register changes from old_value to new_value + /// + /// This should be called BEFORE actually updating the register in the array. + /// + /// # Algorithm + /// + /// 1. Update HIP accumulator (unless out-of-order) + /// 2. Update KxQ registers (always) + /// + /// The KxQ registers are split for numerical precision: + /// - kxq0: sum of 1/2^v for v < 32 + /// - kxq1: sum of 1/2^v for v >= 32 + pub fn update(&mut self, lg_config_k: u8, old_value: u8, new_value: u8) { + let k = (1 << lg_config_k) as f64; + + // Update HIP accumulator FIRST (unless out-of-order) + // When out-of-order (from deserialization or merge), HIP is invalid + if !self.out_of_order { + self.hip_accum += k / (self.kxq0 + self.kxq1); + } + + // Always update KxQ registers (regardless of OOO flag) + self.update_kxq(old_value, new_value); + } + + /// Update only the KxQ registers (internal helper) + fn update_kxq(&mut self, old_value: u8, new_value: u8) { + // Subtract old value contribution + if old_value < 32 { + self.kxq0 -= inv_pow2(old_value); + } else { + self.kxq1 -= inv_pow2(old_value); + } + + // Add new value contribution + if new_value < 32 { + self.kxq0 += inv_pow2(new_value); + } else { + self.kxq1 += inv_pow2(new_value); + } + } + + /// Get the current cardinality estimate + /// + /// Dispatches to either HIP or composite estimator based on out-of-order flag. + /// + /// # Arguments + /// * `lg_config_k` - Log2 of number of registers (k) + /// * `cur_min` - Current minimum register value (for Array4, 0 for Array6/8) + /// * `num_at_cur_min` - Number of registers at cur_min value + pub fn estimate(&self, lg_config_k: u8, cur_min: u8, num_at_cur_min: u32) -> f64 { + if self.out_of_order { + self.get_composite_estimate(lg_config_k, cur_min, num_at_cur_min) + } else { + self.hip_accum + } + } + + /// Get upper bound for cardinality estimate + /// + /// Returns the upper confidence bound for the cardinality estimate. + pub fn upper_bound( + &self, + lg_config_k: u8, + cur_min: u8, + num_at_cur_min: u32, + num_std_dev: u8, + ) -> f64 { + let estimate = self.estimate(lg_config_k, cur_min, num_at_cur_min); + let rse = get_rel_err(lg_config_k, true, self.out_of_order, num_std_dev); + // RSE is negative for upper bounds, so (1 + rse) < 1, making bound > estimate + estimate / (1.0 + rse) + } + + /// Get lower bound for cardinality estimate + /// + /// Returns the lower confidence bound for the cardinality estimate. + pub fn lower_bound( + &self, + lg_config_k: u8, + cur_min: u8, + num_at_cur_min: u32, + num_std_dev: u8, + ) -> f64 { + let estimate = self.estimate(lg_config_k, cur_min, num_at_cur_min); + let rse = get_rel_err(lg_config_k, false, self.out_of_order, num_std_dev); + // RSE is positive for lower bounds, so (1 + rse) > 1, making bound < estimate + estimate / (1.0 + rse) + } + + /// Get raw HLL estimate using standard HyperLogLog formula + /// + /// Formula: correctionFactor * k^2 / (kxq0 + kxq1) + /// + /// Uses lg_k-specific correction factors for small k. + fn get_raw_estimate(&self, lg_config_k: u8) -> f64 { + let k = (1 << lg_config_k) as f64; + + // Correction factors from empirical analysis + let correction_factor = match lg_config_k { + 4 => 0.673, + 5 => 0.697, + 6 => 0.709, + _ => 0.7213 / (1.0 + 1.079 / k), + }; + + (correction_factor * k * k) / (self.kxq0 + self.kxq1) + } + + /// Get linear counting (bitmap) estimate for small cardinalities + /// + /// Uses harmonic numbers to estimate based on empty registers. + fn get_bitmap_estimate(&self, lg_config_k: u8, cur_min: u8, num_at_cur_min: u32) -> f64 { + let k = 1 << lg_config_k; + + // Number of unhit (empty) buckets + let num_unhit = if cur_min == 0 { num_at_cur_min } else { 0 }; + + // Edge case: all buckets hit + if num_unhit == 0 { + return (k as f64) * (k as f64 / 0.5).ln(); + } + + let num_hit = k - num_unhit; + harmonic_numbers::bitmap_estimate(k, num_hit) + } + + /// Get composite estimate (blends raw HLL and linear counting) + /// + /// This is the primary estimator used when in out-of-order mode. + /// It uses cubic interpolation on raw HLL estimate, then blends + /// with linear counting for small cardinalities. + fn get_composite_estimate(&self, lg_config_k: u8, cur_min: u8, num_at_cur_min: u32) -> f64 { + let raw_est = self.get_raw_estimate(lg_config_k); + + // Get composite interpolation table + let x_arr = composite_interpolation::get_x_arr(lg_config_k); + let x_arr_len = composite_interpolation::get_x_arr_length(); + let y_stride = composite_interpolation::get_y_stride(lg_config_k) as f64; + + // Handle edge cases + if raw_est < x_arr[0] { + return 0.0; + } + + let x_arr_len_m1 = x_arr_len - 1; + + // Above interpolation range: extrapolate linearly + if raw_est > x_arr[x_arr_len_m1] { + let final_y = y_stride * (x_arr_len_m1 as f64); + let factor = final_y / x_arr[x_arr_len_m1]; + return raw_est * factor; + } + + // Interpolate using cubic interpolation + let adj_est = cubic_interpolation::using_x_arr_and_y_stride(x_arr, y_stride, raw_est); + + // Avoid linear counting if estimate is high + // (threshold: 3*k ensures we're above potential linear counting instability) + let k = 1 << lg_config_k; + if adj_est > (3 * k) as f64 { + return adj_est; + } + + // Get linear counting estimate + let lin_est = self.get_bitmap_estimate(lg_config_k, cur_min, num_at_cur_min); + + // Blend estimates based on crossover threshold + // Use average to reduce bias from threshold comparison + let avg_est = (adj_est + lin_est) / 2.0; + + // Crossover thresholds (empirically determined) + let crossover = match lg_config_k { + 4 => 0.718, + 5 => 0.672, + _ => 0.64, + }; + + let threshold = crossover * (k as f64); + + if avg_est > threshold { + adj_est + } else { + lin_est + } + } + + /// Get the HIP accumulator value + pub fn hip_accum(&self) -> f64 { + self.hip_accum + } + + /// Get the kxq0 register value + pub fn kxq0(&self) -> f64 { + self.kxq0 + } + + /// Get the kxq1 register value + pub fn kxq1(&self) -> f64 { + self.kxq1 + } + + /// Check if this estimator is in out-of-order mode + pub fn is_out_of_order(&self) -> bool { + self.out_of_order + } + + /// Set the out-of-order flag + /// + /// This should be set to true when: + /// - Deserializing a sketch from bytes + /// - After a merge/union operation + pub fn set_out_of_order(&mut self, ooo: bool) { + self.out_of_order = ooo; + if ooo { + // When going out-of-order, invalidate HIP accumulator + // (it will be recomputed if needed via composite estimator) + self.hip_accum = 0.0; + } + } + + /// Set the HIP accumulator directly + pub fn set_hip_accum(&mut self, value: f64) { + self.hip_accum = value; + } + + /// Set the kxq0 register directly + pub fn set_kxq0(&mut self, value: f64) { + self.kxq0 = value; + } + + /// Set the kxq1 register directly + pub fn set_kxq1(&mut self, value: f64) { + self.kxq1 = value; + } +} + +/// Compute 1 / 2^value (inverse power of 2) +#[inline] +fn inv_pow2(value: u8) -> f64 { + if value == 0 { + 1.0 + } else if value <= 63 { + 1.0 / (1u64 << value) as f64 + } else { + f64::exp2(-(value as f64)) + } +} + +/// Get relative error for HLL estimates +/// +/// This matches the implementation in datasketches-cpp HllUtil.hpp and RelativeErrorTables.hpp +/// +/// # Arguments +/// * `lg_config_k` - Log2 of number of registers (must be 4-21) +/// * `upper_bound` - Whether computing upper bound (vs lower bound) +/// * `ooo` - Whether sketch is out-of-order (merged/deserialized) +/// * `num_std_dev` - Number of standard deviations (1, 2, or 3) +/// +/// # Returns +/// Relative error factor to apply to estimate +fn get_rel_err(lg_config_k: u8, upper_bound: bool, ooo: bool, num_std_dev: u8) -> f64 { + // For lg_k > 12, use analytical formula with RSE factors + if lg_config_k > 12 { + // RSE factors from Apache DataSketches C++ implementation + // HLL_HIP_RSE_FACTOR = sqrt(ln(2)) ≈ 0.8325546 + // HLL_NON_HIP_RSE_FACTOR = sqrt((3 * ln(2)) - 1) ≈ 1.03896 + let rse_factor = if ooo { + 1.03896 // Non-HIP (out-of-order) + } else { + 0.8325546 // HIP (in-order) + }; + + let k = (1 << lg_config_k) as f64; + let sign = if upper_bound { -1.0 } else { 1.0 }; + + return sign * (num_std_dev as f64) * rse_factor / k.sqrt(); + } + + // For lg_k <= 12, use empirically measured lookup tables + // Tables are indexed by: ((lg_k - 4) * 3) + (num_std_dev - 1) + let idx = ((lg_config_k - 4) * 3 + (num_std_dev - 1)) as usize; Review Comment: ```suggestion // For lg_k <= 12, use empirically measured lookup tables. // Tables are indexed by: ((lg_k - 4) * 3) + (num_std_dev - 1) let idx = ((lg_config_k as usize) - 4) * 3 + ((num_std_dev as usize) - 1); ``` Avoid overflow. While `lg_config_k` should be in `[4, 21]` and would surely fill into `u8`, I wonder if what the range of `num_std_dev` technically. From dataskecthes-java's docs, it seems like num_std_dev must be in `{1, 2, 3}` - https://apache.github.io/datasketches-java/8.0.0/org/apache/datasketches/hll/HllSketch.html#getUpperBound(int) If this is the case, we'd better: 1. Add this information in `HllSketch::upper_bound` and `HllSketch::lower_bound` 2. Verify the input and 1. Panics if it is other value (or else `num_std_dev - 1` can underflow) 2. Even, if `NumStdDev` is a common concept, we can define an Enum to build type safety that users can only input {1, 2, 3}. Ref - https://apache.github.io/datasketches-java/8.0.0/resources/dictionary.html#numStdDev -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
