This is an automated email from the ASF dual-hosted git repository. tison pushed a commit to branch codec in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
commit b26d95a986b92f13f54da21f90c8de77b18edc47 Author: tison <[email protected]> AuthorDate: Fri Feb 13 23:59:34 2026 +0800 use family id Signed-off-by: tison <[email protected]> --- datasketches/src/bloom/sketch.rs | 17 +++++------------ datasketches/src/codec/family.rs | 12 ++++++++++++ datasketches/src/countmin/serialization.rs | 1 - datasketches/src/countmin/sketch.rs | 17 +++++------------ datasketches/src/cpc/sketch.rs | 9 +++------ datasketches/src/frequencies/serialization.rs | 2 -- datasketches/src/frequencies/sketch.rs | 18 +++++------------- datasketches/src/hll/array4.rs | 3 ++- datasketches/src/hll/array6.rs | 3 ++- datasketches/src/hll/array8.rs | 3 ++- datasketches/src/hll/hash_set.rs | 3 ++- datasketches/src/hll/list.rs | 3 ++- datasketches/src/hll/serialization.rs | 3 --- datasketches/src/hll/sketch.rs | 8 +++----- datasketches/src/tdigest/serialization.rs | 1 - datasketches/src/tdigest/sketch.rs | 18 +++++++----------- 16 files changed, 50 insertions(+), 71 deletions(-) diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs index 3759739..a1a77b0 100644 --- a/datasketches/src/bloom/sketch.rs +++ b/datasketches/src/bloom/sketch.rs @@ -15,18 +15,17 @@ // specific language governing permissions and limitations // under the License. -use std::hash::Hash; -use std::hash::Hasher; - use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::error::Error; use crate::hash::XxHash64; +use std::hash::Hash; +use std::hash::Hasher; // Serialization constants const PREAMBLE_LONGS_EMPTY: u8 = 3; const PREAMBLE_LONGS_STANDARD: u8 = 4; -const BLOOM_FAMILY_ID: u8 = 21; // Bloom filter family ID const SERIAL_VERSION: u8 = 1; const EMPTY_FLAG_MASK: u8 = 1 << 2; @@ -369,7 +368,7 @@ impl BloomFilter { // Preamble bytes.write_u8(preamble_longs); // Byte 0 bytes.write_u8(SERIAL_VERSION); // Byte 1 - bytes.write_u8(BLOOM_FAMILY_ID); // Byte 2 + bytes.write_u8(Family::BLOOMFILTER.id); // Byte 2 bytes.write_u8(if is_empty { EMPTY_FLAG_MASK } else { 0 }); // Byte 3: flags bytes.write_u16_le(self.num_hashes); // Bytes 4-5 bytes.write_u16_le(0); // Bytes 6-7: unused @@ -432,13 +431,7 @@ impl BloomFilter { .map_err(|_| Error::insufficient_data("flags"))?; // Validate - if family_id != BLOOM_FAMILY_ID { - return Err(Error::invalid_family( - BLOOM_FAMILY_ID, - family_id, - "BloomFilter", - )); - } + Family::BLOOMFILTER.validate_id(family_id)?; if serial_version != SERIAL_VERSION { return Err(Error::unsupported_serial_version( SERIAL_VERSION, diff --git a/datasketches/src/codec/family.rs b/datasketches/src/codec/family.rs index ab741a9..c4e17dc 100644 --- a/datasketches/src/codec/family.rs +++ b/datasketches/src/codec/family.rs @@ -1,3 +1,5 @@ +use crate::error::Error; + /// Defines the various families of sketch and set operation classes. /// /// A family defines a set of classes that share fundamental algorithms and behaviors. The classes @@ -62,3 +64,13 @@ impl Family { max_pre_longs: 4, }; } + +impl Family { + pub fn validate_id(&self, family_id: u8) -> Result<(), Error> { + if family_id != self.id { + Err(Error::invalid_family(self.id, family_id, self.name)) + } else { + Ok(()) + } + } +} diff --git a/datasketches/src/countmin/serialization.rs b/datasketches/src/countmin/serialization.rs index 7d10f59..4f078a9 100644 --- a/datasketches/src/countmin/serialization.rs +++ b/datasketches/src/countmin/serialization.rs @@ -17,6 +17,5 @@ pub(super) const PREAMBLE_LONGS_SHORT: u8 = 2; pub(super) const SERIAL_VERSION: u8 = 1; -pub(super) const COUNTMIN_FAMILY_ID: u8 = 18; pub(super) const FLAGS_IS_EMPTY: u8 = 1 << 0; pub(super) const LONG_SIZE_BYTES: usize = 8; diff --git a/datasketches/src/countmin/sketch.rs b/datasketches/src/countmin/sketch.rs index 4727699..0d6e192 100644 --- a/datasketches/src/countmin/sketch.rs +++ b/datasketches/src/countmin/sketch.rs @@ -15,14 +15,11 @@ // specific language governing permissions and limitations // under the License. -use std::hash::Hash; -use std::hash::Hasher; - use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::countmin::CountMinValue; use crate::countmin::UnsignedCountMinValue; -use crate::countmin::serialization::COUNTMIN_FAMILY_ID; use crate::countmin::serialization::FLAGS_IS_EMPTY; use crate::countmin::serialization::LONG_SIZE_BYTES; use crate::countmin::serialization::PREAMBLE_LONGS_SHORT; @@ -31,6 +28,8 @@ use crate::error::Error; use crate::hash::DEFAULT_UPDATE_SEED; use crate::hash::MurmurHash3X64128; use crate::hash::compute_seed_hash; +use std::hash::Hash; +use std::hash::Hasher; const MAX_TABLE_ENTRIES: usize = 1 << 30; @@ -275,7 +274,7 @@ impl<T: CountMinValue> CountMinSketch<T> { bytes.write_u8(PREAMBLE_LONGS_SHORT); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(COUNTMIN_FAMILY_ID); + bytes.write_u8(Family::COUNTMIN.id); bytes.write_u8(if self.is_empty() { FLAGS_IS_EMPTY } else { 0 }); bytes.write_u32_le(0); // unused @@ -344,13 +343,7 @@ impl<T: CountMinValue> CountMinSketch<T> { let flags = cursor.read_u8().map_err(make_error("flags"))?; cursor.read_u32_le().map_err(make_error("<unused>"))?; - if family_id != COUNTMIN_FAMILY_ID { - return Err(Error::invalid_family( - COUNTMIN_FAMILY_ID, - family_id, - "CountMinSketch", - )); - } + Family::COUNTMIN.validate_id(family_id)?; if serial_version != SERIAL_VERSION { return Err(Error::unsupported_serial_version( SERIAL_VERSION, diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index a1b771b..bf12b47 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -16,7 +16,7 @@ // under the License. use std::hash::Hash; - +use crate::codec::family::Family; use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::common::NumStdDev; @@ -433,7 +433,6 @@ impl CpcSketch { } const SERIAL_VERSION: u8 = 1; -const CPC_FAMILY_ID: u8 = 16; const FLAG_COMPRESSED: u8 = 1; const FLAG_HAS_HIP: u8 = 2; const FLAG_HAS_TABLE: u8 = 3; @@ -453,7 +452,7 @@ impl CpcSketch { let preamble_ints = make_preamble_ints(self.num_coupons, has_hip, has_table, has_window); bytes.write_u8(preamble_ints); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(CPC_FAMILY_ID); + bytes.write_u8(Family::CPC.id); bytes.write_u8(self.lg_k); bytes.write_u8(self.first_interesting_column); let flags = (1 << FLAG_COMPRESSED) @@ -515,9 +514,7 @@ impl CpcSketch { let preamble_ints = cursor.read_u8().map_err(make_error("preamble_ints"))?; let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; let family_id = cursor.read_u8().map_err(make_error("family_id"))?; - if family_id != CPC_FAMILY_ID { - return Err(Error::invalid_family(CPC_FAMILY_ID, family_id, "TDigest")); - } + Family::CPC.validate_id(family_id)?; if serial_version != SERIAL_VERSION { return Err(Error::unsupported_serial_version( SERIAL_VERSION, diff --git a/datasketches/src/frequencies/serialization.rs b/datasketches/src/frequencies/serialization.rs index 44d2891..ed7a898 100644 --- a/datasketches/src/frequencies/serialization.rs +++ b/datasketches/src/frequencies/serialization.rs @@ -21,8 +21,6 @@ use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::error::Error; -/// Family ID for frequency sketches. -pub const FREQUENCY_FAMILY_ID: u8 = 10; /// Serialization version. pub const SERIAL_VERSION: u8 = 1; diff --git a/datasketches/src/frequencies/sketch.rs b/datasketches/src/frequencies/sketch.rs index 1534448..831dab5 100644 --- a/datasketches/src/frequencies/sketch.rs +++ b/datasketches/src/frequencies/sketch.rs @@ -17,13 +17,13 @@ //! Frequent items sketch implementations. -use std::hash::Hash; - use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::error::Error; use crate::frequencies::reverse_purge_item_hash_map::ReversePurgeItemHashMap; use crate::frequencies::serialization::*; +use std::hash::Hash; type CountSerializeSize<T> = fn(&[T]) -> usize; type SerializeItems<T> = fn(&mut SketchBytes, &[T]); @@ -409,7 +409,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> { let mut bytes = SketchBytes::with_capacity(8); bytes.write_u8(PREAMBLE_LONGS_EMPTY); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(FREQUENCY_FAMILY_ID); + bytes.write_u8(Family::FREQUENCY.id); bytes.write_u8(self.lg_max_map_size); bytes.write_u8(self.hash_map.lg_length()); bytes.write_u8(EMPTY_FLAG_MASK); @@ -425,7 +425,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> { let mut bytes = SketchBytes::with_capacity(total_bytes); bytes.write_u8(PREAMBLE_LONGS_NONEMPTY); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(FREQUENCY_FAMILY_ID); + bytes.write_u8(Family::FREQUENCY.id); bytes.write_u8(self.lg_max_map_size); bytes.write_u8(self.hash_map.lg_length()); bytes.write_u8(0); // flags @@ -462,21 +462,13 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> { let flags = cursor.read_u8().map_err(make_error("flags"))?; cursor.read_u16_le().map_err(make_error("<unused>"))?; + Family::FREQUENCY.validate_id(family)?; if serial_version != SERIAL_VERSION { return Err(Error::unsupported_serial_version( SERIAL_VERSION, serial_version, )); } - - if family != FREQUENCY_FAMILY_ID { - return Err(Error::invalid_family( - FREQUENCY_FAMILY_ID, - family, - "FrequentItemsSketch", - )); - } - if lg_cur > lg_max { return Err(Error::deserial("lg_cur_map_size exceeds lg_max_map_size")); } diff --git a/datasketches/src/hll/array4.rs b/datasketches/src/hll/array4.rs index d55883c..01367d5 100644 --- a/datasketches/src/hll/array4.rs +++ b/datasketches/src/hll/array4.rs @@ -20,6 +20,7 @@ //! Array4 stores HLL register values using 4 bits per slot (2 slots per byte). //! When values exceed 4 bits after cur_min offset, they're stored in an auxiliary hash map. +use crate::codec::family::Family; use super::aux_map::AuxMap; use crate::codec::SketchBytes; use crate::codec::SketchSlice; @@ -376,7 +377,7 @@ impl Array4 { // Write standard header bytes.write_u8(HLL_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(0); // unused for HLL mode diff --git a/datasketches/src/hll/array6.rs b/datasketches/src/hll/array6.rs index 4e77e0b..95d6dae 100644 --- a/datasketches/src/hll/array6.rs +++ b/datasketches/src/hll/array6.rs @@ -21,6 +21,7 @@ //! This is sufficient for most HLL use cases without needing exception handling or //! cur_min optimization like Array4. +use crate::codec::family::Family; use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::common::NumStdDev; @@ -229,7 +230,7 @@ impl Array6 { // Write standard header bytes.write_u8(HLL_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(0); // unused for HLL mode diff --git a/datasketches/src/hll/array8.rs b/datasketches/src/hll/array8.rs index 530d18e..402c7ef 100644 --- a/datasketches/src/hll/array8.rs +++ b/datasketches/src/hll/array8.rs @@ -20,6 +20,7 @@ //! Array8 is the simplest HLL array implementation, storing one byte per slot. //! This provides the maximum value range (0-255) with no bit-packing complexity. +use crate::codec::family::Family; use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::common::NumStdDev; @@ -301,7 +302,7 @@ impl Array8 { // Write standard header bytes.write_u8(HLL_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(0); // unused for HLL mode diff --git a/datasketches/src/hll/hash_set.rs b/datasketches/src/hll/hash_set.rs index 874d3a4..1bfe327 100644 --- a/datasketches/src/hll/hash_set.rs +++ b/datasketches/src/hll/hash_set.rs @@ -20,6 +20,7 @@ //! Uses open addressing with a custom stride function to handle collisions. //! Provides better performance than List when many coupons are stored. +use crate::codec::family::Family; use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::error::Error; @@ -149,7 +150,7 @@ impl HashSet { // Write preamble bytes.write_u8(HASH_SET_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(lg_arr as u8); diff --git a/datasketches/src/hll/list.rs b/datasketches/src/hll/list.rs index 1abf699..8459e89 100644 --- a/datasketches/src/hll/list.rs +++ b/datasketches/src/hll/list.rs @@ -20,6 +20,7 @@ //! Provides sequential storage with linear search for duplicates. //! Efficient for small numbers of coupons before transitioning to HashSet. +use crate::codec::family::Family; use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::error::Error; @@ -111,7 +112,7 @@ impl List { // Write preamble bytes.write_u8(LIST_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(lg_arr as u8); diff --git a/datasketches/src/hll/serialization.rs b/datasketches/src/hll/serialization.rs index 5fdb2b3..014b890 100644 --- a/datasketches/src/hll/serialization.rs +++ b/datasketches/src/hll/serialization.rs @@ -20,9 +20,6 @@ //! This module contains all constants related to the Apache DataSketches //! binary serialization format, shared across all sketch modes. -/// Family ID for HLL sketches in DataSketches format -pub const HLL_FAMILY_ID: u8 = 7; - /// Current serialization version pub const SERIAL_VERSION: u8 = 1; diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs index bc1ce42..4c51209 100644 --- a/datasketches/src/hll/sketch.rs +++ b/datasketches/src/hll/sketch.rs @@ -20,9 +20,8 @@ //! This module provides the main [`HllSketch`] struct, which is the primary interface //! for creating and using HLL sketches for cardinality estimation. -use std::hash::Hash; - use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::common::NumStdDev; use crate::error::Error; use crate::hll::HllType; @@ -37,6 +36,7 @@ use crate::hll::hash_set::HashSet; use crate::hll::list::List; use crate::hll::mode::Mode; use crate::hll::serialization::*; +use std::hash::Hash; /// A HyperLogLog sketch. /// @@ -277,9 +277,7 @@ impl HllSketch { let mode_byte = cursor.read_u8().map_err(make_error("mode"))?; // Verify family ID - if family_id != HLL_FAMILY_ID { - return Err(Error::invalid_family(HLL_FAMILY_ID, family_id, "HLL")); - } + Family::HLL.validate_id(family_id)?; // Verify serialization version if serial_version != SERIAL_VERSION { diff --git a/datasketches/src/tdigest/serialization.rs b/datasketches/src/tdigest/serialization.rs index e5b9788..407e2ac 100644 --- a/datasketches/src/tdigest/serialization.rs +++ b/datasketches/src/tdigest/serialization.rs @@ -18,7 +18,6 @@ pub(super) const PREAMBLE_LONGS_EMPTY_OR_SINGLE: u8 = 1; pub(super) const PREAMBLE_LONGS_MULTIPLE: u8 = 2; pub(super) const SERIAL_VERSION: u8 = 1; -pub(super) const TDIGEST_FAMILY_ID: u8 = 20; pub(super) const FLAGS_IS_EMPTY: u8 = 1 << 0; pub(super) const FLAGS_IS_SINGLE_VALUE: u8 = 1 << 1; pub(super) const FLAGS_REVERSE_MERGE: u8 = 1 << 2; diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs index a9ef093..8a67db9 100644 --- a/datasketches/src/tdigest/sketch.rs +++ b/datasketches/src/tdigest/sketch.rs @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. -use std::cmp::Ordering; -use std::convert::identity; -use std::num::NonZeroU64; - use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::error::Error; use crate::tdigest::serialization::*; +use std::cmp::Ordering; +use std::convert::identity; +use std::num::NonZeroU64; /// The default value of K if one is not specified. const DEFAULT_K: u16 = 200; @@ -428,7 +428,7 @@ impl TDigestMut { _ => PREAMBLE_LONGS_MULTIPLE, }); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(TDIGEST_FAMILY_ID); + bytes.write_u8(Family::TDIGEST.id); bytes.write_u16_le(self.k); bytes.write_u8({ let mut flags = 0; @@ -493,15 +493,11 @@ impl TDigestMut { let preamble_longs = cursor.read_u8().map_err(make_error("preamble_longs"))?; let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; let family_id = cursor.read_u8().map_err(make_error("family_id"))?; - if family_id != TDIGEST_FAMILY_ID { + if let Err(err) = Family::TDIGEST.validate_id(family_id) { return if preamble_longs == 0 && serial_version == 0 && family_id == 0 { Self::deserialize_compat(bytes) } else { - Err(Error::invalid_family( - TDIGEST_FAMILY_ID, - family_id, - "TDigest", - )) + Err(err) }; } if serial_version != SERIAL_VERSION { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
