This is an automated email from the ASF dual-hosted git repository.

tison pushed a commit to branch codec
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git

commit b26d95a986b92f13f54da21f90c8de77b18edc47
Author: tison <[email protected]>
AuthorDate: Fri Feb 13 23:59:34 2026 +0800

    use family id
    
    Signed-off-by: tison <[email protected]>
---
 datasketches/src/bloom/sketch.rs              | 17 +++++------------
 datasketches/src/codec/family.rs              | 12 ++++++++++++
 datasketches/src/countmin/serialization.rs    |  1 -
 datasketches/src/countmin/sketch.rs           | 17 +++++------------
 datasketches/src/cpc/sketch.rs                |  9 +++------
 datasketches/src/frequencies/serialization.rs |  2 --
 datasketches/src/frequencies/sketch.rs        | 18 +++++-------------
 datasketches/src/hll/array4.rs                |  3 ++-
 datasketches/src/hll/array6.rs                |  3 ++-
 datasketches/src/hll/array8.rs                |  3 ++-
 datasketches/src/hll/hash_set.rs              |  3 ++-
 datasketches/src/hll/list.rs                  |  3 ++-
 datasketches/src/hll/serialization.rs         |  3 ---
 datasketches/src/hll/sketch.rs                |  8 +++-----
 datasketches/src/tdigest/serialization.rs     |  1 -
 datasketches/src/tdigest/sketch.rs            | 18 +++++++-----------
 16 files changed, 50 insertions(+), 71 deletions(-)

diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs
index 3759739..a1a77b0 100644
--- a/datasketches/src/bloom/sketch.rs
+++ b/datasketches/src/bloom/sketch.rs
@@ -15,18 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::hash::Hash;
-use std::hash::Hasher;
-
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
+use crate::codec::family::Family;
 use crate::error::Error;
 use crate::hash::XxHash64;
+use std::hash::Hash;
+use std::hash::Hasher;
 
 // Serialization constants
 const PREAMBLE_LONGS_EMPTY: u8 = 3;
 const PREAMBLE_LONGS_STANDARD: u8 = 4;
-const BLOOM_FAMILY_ID: u8 = 21; // Bloom filter family ID
 const SERIAL_VERSION: u8 = 1;
 const EMPTY_FLAG_MASK: u8 = 1 << 2;
 
@@ -369,7 +368,7 @@ impl BloomFilter {
         // Preamble
         bytes.write_u8(preamble_longs); // Byte 0
         bytes.write_u8(SERIAL_VERSION); // Byte 1
-        bytes.write_u8(BLOOM_FAMILY_ID); // Byte 2
+        bytes.write_u8(Family::BLOOMFILTER.id); // Byte 2
         bytes.write_u8(if is_empty { EMPTY_FLAG_MASK } else { 0 }); // Byte 3: flags
         bytes.write_u16_le(self.num_hashes); // Bytes 4-5
         bytes.write_u16_le(0); // Bytes 6-7: unused
@@ -432,13 +431,7 @@ impl BloomFilter {
             .map_err(|_| Error::insufficient_data("flags"))?;
 
         // Validate
-        if family_id != BLOOM_FAMILY_ID {
-            return Err(Error::invalid_family(
-                BLOOM_FAMILY_ID,
-                family_id,
-                "BloomFilter",
-            ));
-        }
+        Family::BLOOMFILTER.validate_id(family_id)?;
         if serial_version != SERIAL_VERSION {
             return Err(Error::unsupported_serial_version(
                 SERIAL_VERSION,
diff --git a/datasketches/src/codec/family.rs b/datasketches/src/codec/family.rs
index ab741a9..c4e17dc 100644
--- a/datasketches/src/codec/family.rs
+++ b/datasketches/src/codec/family.rs
@@ -1,3 +1,5 @@
+use crate::error::Error;
+
 /// Defines the various families of sketch and set operation classes.
 ///
 /// A family defines a set of classes that share fundamental algorithms and behaviors. The classes
@@ -62,3 +64,13 @@ impl Family {
         max_pre_longs: 4,
     };
 }
+
+impl Family {
+    pub fn validate_id(&self, family_id: u8) -> Result<(), Error> {
+        if family_id != self.id {
+            Err(Error::invalid_family(self.id, family_id, self.name))
+        } else {
+            Ok(())
+        }
+    }
+}
diff --git a/datasketches/src/countmin/serialization.rs b/datasketches/src/countmin/serialization.rs
index 7d10f59..4f078a9 100644
--- a/datasketches/src/countmin/serialization.rs
+++ b/datasketches/src/countmin/serialization.rs
@@ -17,6 +17,5 @@
 
 pub(super) const PREAMBLE_LONGS_SHORT: u8 = 2;
 pub(super) const SERIAL_VERSION: u8 = 1;
-pub(super) const COUNTMIN_FAMILY_ID: u8 = 18;
 pub(super) const FLAGS_IS_EMPTY: u8 = 1 << 0;
 pub(super) const LONG_SIZE_BYTES: usize = 8;
diff --git a/datasketches/src/countmin/sketch.rs b/datasketches/src/countmin/sketch.rs
index 4727699..0d6e192 100644
--- a/datasketches/src/countmin/sketch.rs
+++ b/datasketches/src/countmin/sketch.rs
@@ -15,14 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::hash::Hash;
-use std::hash::Hasher;
-
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
+use crate::codec::family::Family;
 use crate::countmin::CountMinValue;
 use crate::countmin::UnsignedCountMinValue;
-use crate::countmin::serialization::COUNTMIN_FAMILY_ID;
 use crate::countmin::serialization::FLAGS_IS_EMPTY;
 use crate::countmin::serialization::LONG_SIZE_BYTES;
 use crate::countmin::serialization::PREAMBLE_LONGS_SHORT;
@@ -31,6 +28,8 @@ use crate::error::Error;
 use crate::hash::DEFAULT_UPDATE_SEED;
 use crate::hash::MurmurHash3X64128;
 use crate::hash::compute_seed_hash;
+use std::hash::Hash;
+use std::hash::Hasher;
 
 const MAX_TABLE_ENTRIES: usize = 1 << 30;
 
@@ -275,7 +274,7 @@ impl<T: CountMinValue> CountMinSketch<T> {
 
         bytes.write_u8(PREAMBLE_LONGS_SHORT);
         bytes.write_u8(SERIAL_VERSION);
-        bytes.write_u8(COUNTMIN_FAMILY_ID);
+        bytes.write_u8(Family::COUNTMIN.id);
         bytes.write_u8(if self.is_empty() { FLAGS_IS_EMPTY } else { 0 });
         bytes.write_u32_le(0); // unused
 
@@ -344,13 +343,7 @@ impl<T: CountMinValue> CountMinSketch<T> {
         let flags = cursor.read_u8().map_err(make_error("flags"))?;
         cursor.read_u32_le().map_err(make_error("<unused>"))?;
 
-        if family_id != COUNTMIN_FAMILY_ID {
-            return Err(Error::invalid_family(
-                COUNTMIN_FAMILY_ID,
-                family_id,
-                "CountMinSketch",
-            ));
-        }
+        Family::COUNTMIN.validate_id(family_id)?;
         if serial_version != SERIAL_VERSION {
             return Err(Error::unsupported_serial_version(
                 SERIAL_VERSION,
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
index a1b771b..bf12b47 100644
--- a/datasketches/src/cpc/sketch.rs
+++ b/datasketches/src/cpc/sketch.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use std::hash::Hash;
-
+use crate::codec::family::Family;
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
 use crate::common::NumStdDev;
@@ -433,7 +433,6 @@ impl CpcSketch {
 }
 
 const SERIAL_VERSION: u8 = 1;
-const CPC_FAMILY_ID: u8 = 16;
 const FLAG_COMPRESSED: u8 = 1;
 const FLAG_HAS_HIP: u8 = 2;
 const FLAG_HAS_TABLE: u8 = 3;
@@ -453,7 +452,7 @@ impl CpcSketch {
         let preamble_ints = make_preamble_ints(self.num_coupons, has_hip, has_table, has_window);
         bytes.write_u8(preamble_ints);
         bytes.write_u8(SERIAL_VERSION);
-        bytes.write_u8(CPC_FAMILY_ID);
+        bytes.write_u8(Family::CPC.id);
         bytes.write_u8(self.lg_k);
         bytes.write_u8(self.first_interesting_column);
         let flags = (1 << FLAG_COMPRESSED)
@@ -515,9 +514,7 @@ impl CpcSketch {
         let preamble_ints = cursor.read_u8().map_err(make_error("preamble_ints"))?;
         let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?;
         let family_id = cursor.read_u8().map_err(make_error("family_id"))?;
-        if family_id != CPC_FAMILY_ID {
-            return Err(Error::invalid_family(CPC_FAMILY_ID, family_id, "TDigest"));
-        }
+        Family::CPC.validate_id(family_id)?;
         if serial_version != SERIAL_VERSION {
             return Err(Error::unsupported_serial_version(
                 SERIAL_VERSION,
diff --git a/datasketches/src/frequencies/serialization.rs b/datasketches/src/frequencies/serialization.rs
index 44d2891..ed7a898 100644
--- a/datasketches/src/frequencies/serialization.rs
+++ b/datasketches/src/frequencies/serialization.rs
@@ -21,8 +21,6 @@ use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
 use crate::error::Error;
 
-/// Family ID for frequency sketches.
-pub const FREQUENCY_FAMILY_ID: u8 = 10;
 /// Serialization version.
 pub const SERIAL_VERSION: u8 = 1;
 
diff --git a/datasketches/src/frequencies/sketch.rs b/datasketches/src/frequencies/sketch.rs
index 1534448..831dab5 100644
--- a/datasketches/src/frequencies/sketch.rs
+++ b/datasketches/src/frequencies/sketch.rs
@@ -17,13 +17,13 @@
 
 //! Frequent items sketch implementations.
 
-use std::hash::Hash;
-
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
+use crate::codec::family::Family;
 use crate::error::Error;
 use crate::frequencies::reverse_purge_item_hash_map::ReversePurgeItemHashMap;
 use crate::frequencies::serialization::*;
+use std::hash::Hash;
 
 type CountSerializeSize<T> = fn(&[T]) -> usize;
 type SerializeItems<T> = fn(&mut SketchBytes, &[T]);
@@ -409,7 +409,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
             let mut bytes = SketchBytes::with_capacity(8);
             bytes.write_u8(PREAMBLE_LONGS_EMPTY);
             bytes.write_u8(SERIAL_VERSION);
-            bytes.write_u8(FREQUENCY_FAMILY_ID);
+            bytes.write_u8(Family::FREQUENCY.id);
             bytes.write_u8(self.lg_max_map_size);
             bytes.write_u8(self.hash_map.lg_length());
             bytes.write_u8(EMPTY_FLAG_MASK);
@@ -425,7 +425,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
         let mut bytes = SketchBytes::with_capacity(total_bytes);
         bytes.write_u8(PREAMBLE_LONGS_NONEMPTY);
         bytes.write_u8(SERIAL_VERSION);
-        bytes.write_u8(FREQUENCY_FAMILY_ID);
+        bytes.write_u8(Family::FREQUENCY.id);
         bytes.write_u8(self.lg_max_map_size);
         bytes.write_u8(self.hash_map.lg_length());
         bytes.write_u8(0); // flags
@@ -462,21 +462,13 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
         let flags = cursor.read_u8().map_err(make_error("flags"))?;
         cursor.read_u16_le().map_err(make_error("<unused>"))?;
 
+        Family::FREQUENCY.validate_id(family)?;
         if serial_version != SERIAL_VERSION {
             return Err(Error::unsupported_serial_version(
                 SERIAL_VERSION,
                 serial_version,
             ));
         }
-
-        if family != FREQUENCY_FAMILY_ID {
-            return Err(Error::invalid_family(
-                FREQUENCY_FAMILY_ID,
-                family,
-                "FrequentItemsSketch",
-            ));
-        }
-
         if lg_cur > lg_max {
             return Err(Error::deserial("lg_cur_map_size exceeds lg_max_map_size"));
         }
diff --git a/datasketches/src/hll/array4.rs b/datasketches/src/hll/array4.rs
index d55883c..01367d5 100644
--- a/datasketches/src/hll/array4.rs
+++ b/datasketches/src/hll/array4.rs
@@ -20,6 +20,7 @@
 //! Array4 stores HLL register values using 4 bits per slot (2 slots per byte).
 //! When values exceed 4 bits after cur_min offset, they're stored in an auxiliary hash map.
 
+use crate::codec::family::Family;
 use super::aux_map::AuxMap;
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
@@ -376,7 +377,7 @@ impl Array4 {
         // Write standard header
         bytes.write_u8(HLL_PREINTS);
         bytes.write_u8(SERIAL_VERSION);
-        bytes.write_u8(HLL_FAMILY_ID);
+        bytes.write_u8(Family::HLL.id);
         bytes.write_u8(lg_config_k);
         bytes.write_u8(0); // unused for HLL mode
 
diff --git a/datasketches/src/hll/array6.rs b/datasketches/src/hll/array6.rs
index 4e77e0b..95d6dae 100644
--- a/datasketches/src/hll/array6.rs
+++ b/datasketches/src/hll/array6.rs
@@ -21,6 +21,7 @@
 //! This is sufficient for most HLL use cases without needing exception handling or
 //! cur_min optimization like Array4.
 
+use crate::codec::family::Family;
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
 use crate::common::NumStdDev;
@@ -229,7 +230,7 @@ impl Array6 {
         // Write standard header
         bytes.write_u8(HLL_PREINTS);
         bytes.write_u8(SERIAL_VERSION);
-        bytes.write_u8(HLL_FAMILY_ID);
+        bytes.write_u8(Family::HLL.id);
         bytes.write_u8(lg_config_k);
         bytes.write_u8(0); // unused for HLL mode
 
diff --git a/datasketches/src/hll/array8.rs b/datasketches/src/hll/array8.rs
index 530d18e..402c7ef 100644
--- a/datasketches/src/hll/array8.rs
+++ b/datasketches/src/hll/array8.rs
@@ -20,6 +20,7 @@
 //! Array8 is the simplest HLL array implementation, storing one byte per slot.
 //! This provides the maximum value range (0-255) with no bit-packing complexity.
 
+use crate::codec::family::Family;
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
 use crate::common::NumStdDev;
@@ -301,7 +302,7 @@ impl Array8 {
         // Write standard header
         bytes.write_u8(HLL_PREINTS);
         bytes.write_u8(SERIAL_VERSION);
-        bytes.write_u8(HLL_FAMILY_ID);
+        bytes.write_u8(Family::HLL.id);
         bytes.write_u8(lg_config_k);
         bytes.write_u8(0); // unused for HLL mode
 
diff --git a/datasketches/src/hll/hash_set.rs b/datasketches/src/hll/hash_set.rs
index 874d3a4..1bfe327 100644
--- a/datasketches/src/hll/hash_set.rs
+++ b/datasketches/src/hll/hash_set.rs
@@ -20,6 +20,7 @@
 //! Uses open addressing with a custom stride function to handle collisions.
 //! Provides better performance than List when many coupons are stored.
 
+use crate::codec::family::Family;
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
 use crate::error::Error;
@@ -149,7 +150,7 @@ impl HashSet {
         // Write preamble
         bytes.write_u8(HASH_SET_PREINTS);
         bytes.write_u8(SERIAL_VERSION);
-        bytes.write_u8(HLL_FAMILY_ID);
+        bytes.write_u8(Family::HLL.id);
         bytes.write_u8(lg_config_k);
         bytes.write_u8(lg_arr as u8);
 
diff --git a/datasketches/src/hll/list.rs b/datasketches/src/hll/list.rs
index 1abf699..8459e89 100644
--- a/datasketches/src/hll/list.rs
+++ b/datasketches/src/hll/list.rs
@@ -20,6 +20,7 @@
 //! Provides sequential storage with linear search for duplicates.
 //! Efficient for small numbers of coupons before transitioning to HashSet.
 
+use crate::codec::family::Family;
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
 use crate::error::Error;
@@ -111,7 +112,7 @@ impl List {
         // Write preamble
         bytes.write_u8(LIST_PREINTS);
         bytes.write_u8(SERIAL_VERSION);
-        bytes.write_u8(HLL_FAMILY_ID);
+        bytes.write_u8(Family::HLL.id);
         bytes.write_u8(lg_config_k);
         bytes.write_u8(lg_arr as u8);
 
diff --git a/datasketches/src/hll/serialization.rs b/datasketches/src/hll/serialization.rs
index 5fdb2b3..014b890 100644
--- a/datasketches/src/hll/serialization.rs
+++ b/datasketches/src/hll/serialization.rs
@@ -20,9 +20,6 @@
 //! This module contains all constants related to the Apache DataSketches
 //! binary serialization format, shared across all sketch modes.
 
-/// Family ID for HLL sketches in DataSketches format
-pub const HLL_FAMILY_ID: u8 = 7;
-
 /// Current serialization version
 pub const SERIAL_VERSION: u8 = 1;
 
diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs
index bc1ce42..4c51209 100644
--- a/datasketches/src/hll/sketch.rs
+++ b/datasketches/src/hll/sketch.rs
@@ -20,9 +20,8 @@
 //! This module provides the main [`HllSketch`] struct, which is the primary interface
 //! for creating and using HLL sketches for cardinality estimation.
 
-use std::hash::Hash;
-
 use crate::codec::SketchSlice;
+use crate::codec::family::Family;
 use crate::common::NumStdDev;
 use crate::error::Error;
 use crate::hll::HllType;
@@ -37,6 +36,7 @@ use crate::hll::hash_set::HashSet;
 use crate::hll::list::List;
 use crate::hll::mode::Mode;
 use crate::hll::serialization::*;
+use std::hash::Hash;
 
 /// A HyperLogLog sketch.
 ///
@@ -277,9 +277,7 @@ impl HllSketch {
         let mode_byte = cursor.read_u8().map_err(make_error("mode"))?;
 
         // Verify family ID
-        if family_id != HLL_FAMILY_ID {
-            return Err(Error::invalid_family(HLL_FAMILY_ID, family_id, "HLL"));
-        }
+        Family::HLL.validate_id(family_id)?;
 
         // Verify serialization version
         if serial_version != SERIAL_VERSION {
diff --git a/datasketches/src/tdigest/serialization.rs b/datasketches/src/tdigest/serialization.rs
index e5b9788..407e2ac 100644
--- a/datasketches/src/tdigest/serialization.rs
+++ b/datasketches/src/tdigest/serialization.rs
@@ -18,7 +18,6 @@
 pub(super) const PREAMBLE_LONGS_EMPTY_OR_SINGLE: u8 = 1;
 pub(super) const PREAMBLE_LONGS_MULTIPLE: u8 = 2;
 pub(super) const SERIAL_VERSION: u8 = 1;
-pub(super) const TDIGEST_FAMILY_ID: u8 = 20;
 pub(super) const FLAGS_IS_EMPTY: u8 = 1 << 0;
 pub(super) const FLAGS_IS_SINGLE_VALUE: u8 = 1 << 1;
 pub(super) const FLAGS_REVERSE_MERGE: u8 = 1 << 2;
diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs
index a9ef093..8a67db9 100644
--- a/datasketches/src/tdigest/sketch.rs
+++ b/datasketches/src/tdigest/sketch.rs
@@ -15,14 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::cmp::Ordering;
-use std::convert::identity;
-use std::num::NonZeroU64;
-
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
+use crate::codec::family::Family;
 use crate::error::Error;
 use crate::tdigest::serialization::*;
+use std::cmp::Ordering;
+use std::convert::identity;
+use std::num::NonZeroU64;
 
 /// The default value of K if one is not specified.
 const DEFAULT_K: u16 = 200;
@@ -428,7 +428,7 @@ impl TDigestMut {
             _ => PREAMBLE_LONGS_MULTIPLE,
         });
         bytes.write_u8(SERIAL_VERSION);
-        bytes.write_u8(TDIGEST_FAMILY_ID);
+        bytes.write_u8(Family::TDIGEST.id);
         bytes.write_u16_le(self.k);
         bytes.write_u8({
             let mut flags = 0;
@@ -493,15 +493,11 @@ impl TDigestMut {
         let preamble_longs = cursor.read_u8().map_err(make_error("preamble_longs"))?;
         let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?;
         let family_id = cursor.read_u8().map_err(make_error("family_id"))?;
-        if family_id != TDIGEST_FAMILY_ID {
+        if let Err(err) = Family::TDIGEST.validate_id(family_id) {
             return if preamble_longs == 0 && serial_version == 0 && family_id == 0 {
                 Self::deserialize_compat(bytes)
             } else {
-                Err(Error::invalid_family(
-                    TDIGEST_FAMILY_ID,
-                    family_id,
-                    "TDigest",
-                ))
+                Err(err)
             };
         }
         if serial_version != SERIAL_VERSION {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to