tisonkun commented on code in PR #45: URL: https://github.com/apache/datasketches-rust/pull/45#discussion_r2653240596
########## datasketches/src/theta/hash_table.rs: ########## @@ -0,0 +1,698 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::hash::Hash; + +use crate::hash::MurmurHash3X64128; + +/// Maximum theta value (signed max for compatibility with Java) +pub const MAX_THETA: u64 = i64::MAX as u64; + +/// Minimum log2 of K +pub const MIN_LG_K: u8 = 5; + +/// Maximum log2 of K +pub const MAX_LG_K: u8 = 26; + +/// Default log2 of K +pub const DEFAULT_LG_K: u8 = 12; + +/// Hash table resize factor +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum ResizeFactor { + /// Resize by factor of 2 + X2 = 2, + /// Resize by factor of 4 + X4 = 4, + /// Resize by factor of 8 + #[default] + X8 = 8, +} + +impl ResizeFactor { + pub fn lg_value(&self) -> u8 { + match self { + ResizeFactor::X2 => 1, + ResizeFactor::X4 => 2, + ResizeFactor::X8 => 3, + } + } +} + +/// Resize threshold (0.5 = 50% load factor) +const RESIZE_THRESHOLD: f64 = 0.5; + +/// Rebuild threshold (15/16 = 93.75% load factor) +const REBUILD_THRESHOLD: f64 = 15.0 / 16.0; + +/// Stride hash bits (7 bits for stride calculation) +const STRIDE_HASH_BITS: u8 = 7; + +/// Stride mask +const STRIDE_MASK: u64 = (1 << STRIDE_HASH_BITS) - 1; + +/// Specific hash table for theta sketch +/// +/// It maintain a array capacity max to [2 ** (lg_max_size)]: +/// - Before it reach the max capacity, it will extend the array based on resize_factor. +/// - After it reach the capacity bigger than 2 ** (lg_nom_size), every time the number of entries +/// exceeds the threshold, it will rebuild the table: only keep the min 2 ** lg_nom_size entries +/// and update the theta to the k-th smallest entry. Review Comment: ```suggestion /// Specific hash table for theta sketch /// /// It maintains an array capacity max to 2^lg_max_size: /// * Before it reaches the max capacity, it will extend the array based on resize_factor. /// * After it reaches the capacity bigger than 2^lg_nom_size, every time the number of entries /// exceeds the threshold, it will rebuild the table: only keep the min 2^lg_nom_size entries /// and update the theta to the k-th smallest entry. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
