tisonkun commented on code in PR #1: URL: https://github.com/apache/datasketches-rust/pull/1#discussion_r2617190889
########## src/hll/array4.rs: ########## @@ -0,0 +1,472 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! HyperLogLog Array4 mode - 4-bit packed representation with exception handling +//! +//! Array4 stores HLL register values using 4 bits per slot (2 slots per byte). +//! When values exceed 4 bits after cur_min offset, they're stored in an auxiliary hash map. + +use super::aux_map::AuxMap; +use crate::error::SerdeResult; +use crate::hll::estimator::HipEstimator; +use crate::hll::{NumStdDev, get_slot, get_value}; + +const AUX_TOKEN: u8 = 15; + +/// Core Array4 data structure - stores 4-bit values efficiently +#[derive(Debug, Clone, PartialEq)] +pub struct Array4 { + lg_config_k: u8, + /// Packed 4-bit values: 2 values per byte + /// Even slots use low nibble, odd slots use high nibble + bytes: Box<[u8]>, + /// Current minimum value offset (optimization to delay aux map creation) + cur_min: u8, + /// Count of slots at exactly cur_min (when 0, increment cur_min) + num_at_cur_min: u32, + /// Exception table for values >= 15 after cur_min offset + aux_map: Option<AuxMap>, + /// HIP estimator for cardinality estimation + estimator: HipEstimator, +} + +impl Array4 { + pub fn new(lg_config_k: u8) -> Self { + let num_bytes = 1 << (lg_config_k - 1); + let num_at_cur_min = 1 << lg_config_k; + Self { + lg_config_k, + bytes: vec![0u8; num_bytes].into_boxed_slice(), + cur_min: 0, + num_at_cur_min, + aux_map: None, + estimator: HipEstimator::new(lg_config_k), + } + } + + /// Get raw 4-bit value from slot (not adjusted for cur_min) + #[inline] + fn get_raw(&self, slot: u32) -> u8 { + debug_assert!(slot >> 1 < self.bytes.len() as u32); + + let byte = self.bytes[(slot >> 1) as usize]; + if slot & 1 == 0 { + byte & 15 // low nibble for even slots + } else { + byte >> 4 // high nibble for odd slots + } + } + + /// Set raw 4-bit value in slot + #[inline] + fn put_raw(&mut self, slot: u32, value: u8) { + debug_assert!(value <= AUX_TOKEN); + debug_assert!(slot >> 1 < self.bytes.len() as u32); + + let byte_idx = (slot >> 1) as usize; + let old_byte = self.bytes[byte_idx]; + self.bytes[byte_idx] = if slot & 1 == 0 { + (old_byte & 0xF0) | (value & 0x0F) // set low nibble + } else { + (old_byte & 0x0F) | (value << 4) // set high nibble + }; + } + + pub fn update(&mut self, coupon: u32) { + let mask = (1 << self.lg_config_k) - 1; + let slot = get_slot(coupon) & mask; + let new_value = get_value(coupon); + + // Quick rejection: if new value <= cur_min, no update needed + if new_value <= self.cur_min { + return; + } + + let raw_stored = self.get_raw(slot); + let lower_bound = raw_stored + self.cur_min; + + if new_value <= lower_bound { + return; + } + + // Get actual old value (might be in aux map) + let old_value = if raw_stored < AUX_TOKEN { + lower_bound + } else { + self.aux_map + .as_ref() + .expect("AUX_TOKEN without aux_map") + .get(slot) + .expect("AUX_TOKEN but slot not in aux_map") + }; + + if new_value <= old_value { + return; + } + + // Update HIP and KxQ registers via estimator + self.estimator + .update(self.lg_config_k, old_value, new_value); + + let shifted_new = new_value - self.cur_min; + + // Four cases based on old/new exception status + match (raw_stored, shifted_new) { + // Case 1: Both old and new are exceptions + (AUX_TOKEN, shifted) if shifted >= AUX_TOKEN => { + self.aux_map.as_mut().unwrap().replace(slot, new_value); Review Comment: `unwrap` and `expect` are not quite different from a technical view. So just take the context into consideration, and since we have the source code open for everybody, they should be able to check the source code when needed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
