This is an automated email from the ASF dual-hosted git repository. tison pushed a commit to branch codec in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
commit b5c41679b04cb5092d140552adf9adfad6b8fa7e Author: tison <[email protected]> AuthorDate: Fri Feb 13 23:51:13 2026 +0800 refactor: expose codec and add centralized Family Signed-off-by: tison <[email protected]> --- datasketches/src/{codec.rs => codec/decode.rs} | 138 +++++-------------------- datasketches/src/codec/encode.rs | 113 ++++++++++++++++++++ datasketches/src/codec/family.rs | 64 ++++++++++++ datasketches/src/codec/mod.rs | 27 +++++ datasketches/src/lib.rs | 2 +- 5 files changed, 229 insertions(+), 115 deletions(-) diff --git a/datasketches/src/codec.rs b/datasketches/src/codec/decode.rs similarity index 54% rename from datasketches/src/codec.rs rename to datasketches/src/codec/decode.rs index 4df7b22..52b005e 100644 --- a/datasketches/src/codec.rs +++ b/datasketches/src/codec/decode.rs @@ -1,240 +1,150 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#![allow(dead_code)] - use std::io; -use std::io::Cursor; -use std::io::Read; - -pub(crate) struct SketchBytes { - bytes: Vec<u8>, -} - -impl SketchBytes { - pub fn with_capacity(capacity: usize) -> Self { - Self { - bytes: Vec::with_capacity(capacity), - } - } - - pub fn into_bytes(self) -> Vec<u8> { - self.bytes - } - - pub fn write(&mut self, buf: &[u8]) { - self.bytes.extend_from_slice(buf); - } - - pub fn write_u8(&mut self, n: u8) { - self.bytes.push(n); - } - - pub fn write_i8(&mut self, n: i8) { - self.bytes.push(n as u8); - } - - pub fn write_u16_le(&mut self, n: u16) { - self.write(&n.to_le_bytes()); - } - - pub fn write_u16_be(&mut self, n: u16) { - self.write(&n.to_be_bytes()); - } - - pub fn write_i16_le(&mut self, n: i16) { - self.write(&n.to_le_bytes()); - } - - pub fn write_i16_be(&mut self, n: i16) { - self.write(&n.to_be_bytes()); - } - - pub fn write_u32_le(&mut self, n: u32) { - self.write(&n.to_le_bytes()); - } - - pub fn write_u32_be(&mut self, n: u32) { - self.write(&n.to_be_bytes()); - } - - pub fn write_i32_le(&mut self, n: i32) { - self.write(&n.to_le_bytes()); - } - - pub fn write_i32_be(&mut self, n: i32) { - self.write(&n.to_be_bytes()); - } - - pub fn write_u64_le(&mut self, n: u64) { - self.write(&n.to_le_bytes()); - } - - pub fn write_u64_be(&mut self, n: u64) { - self.write(&n.to_be_bytes()); - } - - pub fn write_i64_le(&mut self, n: i64) { - self.write(&n.to_le_bytes()); - } - - pub fn write_i64_be(&mut self, n: i64) { - self.write(&n.to_be_bytes()); - } - - pub fn write_f32_le(&mut self, n: f32) { - self.write(&n.to_le_bytes()); - } - - pub fn write_f32_be(&mut self, n: f32) { - self.write(&n.to_be_bytes()); - } - - pub fn write_f64_le(&mut self, n: f64) { - self.write(&n.to_le_bytes()); - } - - pub fn write_f64_be(&mut self, n: f64) { - self.write(&n.to_be_bytes()); - } -} +use std::io::{Cursor, Read}; -pub(crate) struct SketchSlice<'a> { +/// A wrapper around a byte slice that provides methods for reading various types of 
data from it. +pub struct SketchSlice<'a> { slice: Cursor<&'a [u8]>, } impl SketchSlice<'_> { + /// Creates a new `SketchSlice` from the given byte slice. pub fn new(slice: &[u8]) -> SketchSlice<'_> { SketchSlice { slice: Cursor::new(slice), } } + /// Advances the position of the slice by `n` bytes. pub fn advance(&mut self, n: u64) { let pos = self.slice.position(); self.slice.set_position(pos + n); } + /// Reads exactly `buf.len()` bytes from the slice into `buf`. pub fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> { self.slice.read_exact(buf) } + /// Reads a single byte from the slice and returns it as a `u8`. pub fn read_u8(&mut self) -> io::Result<u8> { let mut buf = [0u8; 1]; self.read_exact(&mut buf)?; Ok(buf[0]) } + /// Reads a single byte from the slice and returns it as an `i8`. pub fn read_i8(&mut self) -> io::Result<i8> { let mut buf = [0u8; 1]; self.read_exact(&mut buf)?; Ok(buf[0] as i8) } + /// Reads a 16-bit unsigned integer from the slice in little-endian byte order. pub fn read_u16_le(&mut self) -> io::Result<u16> { let mut buf = [0u8; 2]; self.read_exact(&mut buf)?; Ok(u16::from_le_bytes(buf)) } + /// Reads a 16-bit unsigned integer from the slice in big-endian byte order. pub fn read_u16_be(&mut self) -> io::Result<u16> { let mut buf = [0u8; 2]; self.read_exact(&mut buf)?; Ok(u16::from_be_bytes(buf)) } + /// Reads a 16-bit signed integer from the slice in little-endian byte order. pub fn read_i16_le(&mut self) -> io::Result<i16> { let mut buf = [0u8; 2]; self.read_exact(&mut buf)?; Ok(i16::from_le_bytes(buf)) } + /// Reads a 16-bit signed integer from the slice in big-endian byte order. pub fn read_i16_be(&mut self) -> io::Result<i16> { let mut buf = [0u8; 2]; self.read_exact(&mut buf)?; Ok(i16::from_be_bytes(buf)) } + /// Reads a 32-bit unsigned integer from the slice in little-endian byte order. 
pub fn read_u32_le(&mut self) -> io::Result<u32> { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(u32::from_le_bytes(buf)) } + /// Reads a 32-bit unsigned integer from the slice in big-endian byte order. pub fn read_u32_be(&mut self) -> io::Result<u32> { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(u32::from_be_bytes(buf)) } + /// Reads a 32-bit signed integer from the slice in little-endian byte order. pub fn read_i32_le(&mut self) -> io::Result<i32> { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(i32::from_le_bytes(buf)) } + /// Reads a 32-bit signed integer from the slice in big-endian byte order. pub fn read_i32_be(&mut self) -> io::Result<i32> { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(i32::from_be_bytes(buf)) } + /// Reads a 64-bit unsigned integer from the slice in little-endian byte order. pub fn read_u64_le(&mut self) -> io::Result<u64> { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(u64::from_le_bytes(buf)) } + /// Reads a 64-bit unsigned integer from the slice in big-endian byte order. pub fn read_u64_be(&mut self) -> io::Result<u64> { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(u64::from_be_bytes(buf)) } + /// Reads a 64-bit signed integer from the slice in little-endian byte order. pub fn read_i64_le(&mut self) -> io::Result<i64> { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(i64::from_le_bytes(buf)) } + /// Reads a 64-bit signed integer from the slice in big-endian byte order. pub fn read_i64_be(&mut self) -> io::Result<i64> { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(i64::from_be_bytes(buf)) } + /// Reads a 32-bit floating-point number from the slice in little-endian byte order. pub fn read_f32_le(&mut self) -> io::Result<f32> { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(f32::from_le_bytes(buf)) } + /// Reads a 32-bit floating-point number from the slice in big-endian byte order. 
pub fn read_f32_be(&mut self) -> io::Result<f32> { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(f32::from_be_bytes(buf)) } + /// Reads a 64-bit floating-point number from the slice in little-endian byte order. pub fn read_f64_le(&mut self) -> io::Result<f64> { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(f64::from_le_bytes(buf)) } + /// Reads a 64-bit floating-point number from the slice in big-endian byte order. pub fn read_f64_be(&mut self) -> io::Result<f64> { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; diff --git a/datasketches/src/codec/encode.rs b/datasketches/src/codec/encode.rs new file mode 100644 index 0000000..0f1423f --- /dev/null +++ b/datasketches/src/codec/encode.rs @@ -0,0 +1,113 @@ +/// A simple wrapper around a `Vec<u8>` that provides methods for writing various types of data. +pub struct SketchBytes { + bytes: Vec<u8>, +} + +impl SketchBytes { + /// Constructs an empty `SketchBytes` with at least the specified capacity. + pub fn with_capacity(capacity: usize) -> Self { + Self { + bytes: Vec::with_capacity(capacity), + } + } + + /// Consumes the `SketchBytes` and returns the underlying `Vec<u8>`. + pub fn into_bytes(self) -> Vec<u8> { + self.bytes + } + + /// Writes the given byte slice to the `SketchBytes`. + pub fn write(&mut self, buf: &[u8]) { + self.bytes.extend_from_slice(buf); + } + + /// Writes a single byte to the `SketchBytes`. + pub fn write_u8(&mut self, n: u8) { + self.bytes.push(n); + } + + /// Writes a single byte to the `SketchBytes`. + pub fn write_i8(&mut self, n: i8) { + self.bytes.push(n as u8); + } + + /// Writes a 16-bit unsigned integer to the `SketchBytes` in little-endian byte order. + pub fn write_u16_le(&mut self, n: u16) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 16-bit unsigned integer to the `SketchBytes` in big-endian byte order. 
+ pub fn write_u16_be(&mut self, n: u16) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 16-bit signed integer to the `SketchBytes` in little-endian byte order. + pub fn write_i16_le(&mut self, n: i16) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 16-bit signed integer to the `SketchBytes` in big-endian byte order. + pub fn write_i16_be(&mut self, n: i16) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 32-bit unsigned integer to the `SketchBytes` in little-endian byte order. + pub fn write_u32_le(&mut self, n: u32) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 32-bit unsigned integer to the `SketchBytes` in big-endian byte order. + pub fn write_u32_be(&mut self, n: u32) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 32-bit signed integer to the `SketchBytes` in little-endian byte order. + pub fn write_i32_le(&mut self, n: i32) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 32-bit signed integer to the `SketchBytes` in big-endian byte order. + pub fn write_i32_be(&mut self, n: i32) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 64-bit unsigned integer to the `SketchBytes` in little-endian byte order. + pub fn write_u64_le(&mut self, n: u64) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 64-bit unsigned integer to the `SketchBytes` in big-endian byte order. + pub fn write_u64_be(&mut self, n: u64) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 64-bit signed integer to the `SketchBytes` in little-endian byte order. + pub fn write_i64_le(&mut self, n: i64) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 64-bit signed integer to the `SketchBytes` in big-endian byte order. + pub fn write_i64_be(&mut self, n: i64) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 32-bit floating-point number to the `SketchBytes` in little-endian byte order. 
+ pub fn write_f32_le(&mut self, n: f32) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 32-bit floating-point number to the `SketchBytes` in big-endian byte order. + pub fn write_f32_be(&mut self, n: f32) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 64-bit floating-point number to the `SketchBytes` in little-endian byte order. + pub fn write_f64_le(&mut self, n: f64) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 64-bit floating-point number to the `SketchBytes` in big-endian byte order. + pub fn write_f64_be(&mut self, n: f64) { + self.write(&n.to_be_bytes()); + } +} diff --git a/datasketches/src/codec/family.rs b/datasketches/src/codec/family.rs new file mode 100644 index 0000000..ab741a9 --- /dev/null +++ b/datasketches/src/codec/family.rs @@ -0,0 +1,64 @@ +/// Defines the various families of sketch and set operation classes. +/// +/// A family defines a set of classes that share fundamental algorithms and behaviors. The classes +/// within a family may still differ by how they are stored and accessed. +pub struct Family { + /// The byte ID for this family. + pub id: u8, + /// The name for this family. + pub name: &'static str, + /// The minimum preamble size for this family in longs (8-byte integers). + pub min_pre_longs: u8, + /// The maximum preamble size for this family in longs (8-byte integers). + pub max_pre_longs: u8, +} + +impl Family { + /// The HLL family of sketches. + pub const HLL: Family = Family { + id: 7, + name: "HLL", + min_pre_longs: 1, + max_pre_longs: 1, + }; + + /// The Frequency family of sketches. + pub const FREQUENCY: Family = Family { + id: 10, + name: "FREQUENCY", + min_pre_longs: 1, + max_pre_longs: 4, + }; + + /// Compressed Probabilistic Counting (CPC) Sketch. 
+ pub const CPC: Family = Family { + id: 16, + name: "CPC", + min_pre_longs: 1, + max_pre_longs: 5, + }; + + /// CountMin Sketch + pub const COUNTMIN: Family = Family { + id: 17, + name: "COUNTMIN", + min_pre_longs: 2, + max_pre_longs: 2, + }; + + /// T-Digest for estimating quantiles and ranks. + pub const TDIGEST: Family = Family { + id: 20, + name: "TDIGEST", + min_pre_longs: 1, + max_pre_longs: 2, + }; + + /// Bloom Filter. + pub const BLOOMFILTER: Family = Family { + id: 24, + name: "BLOOMFILTER", + min_pre_longs: 3, + max_pre_longs: 4, + }; +} diff --git a/datasketches/src/codec/mod.rs b/datasketches/src/codec/mod.rs new file mode 100644 index 0000000..947d228 --- /dev/null +++ b/datasketches/src/codec/mod.rs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Codec utilities for datasketches crate. 
+ +// public common codec utilities for datasketches crate +mod encode; +mod decode; +pub use self::encode::SketchBytes; +pub use self::decode::SketchSlice; + +// private to datasketches crate +pub(crate) mod family; diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs index 17701ab..02dc692 100644 --- a/datasketches/src/lib.rs +++ b/datasketches/src/lib.rs @@ -31,6 +31,7 @@ compile_error!("datasketches does not support big-endian targets"); pub mod bloom; +pub mod codec; pub mod common; pub mod countmin; pub mod cpc; @@ -40,5 +41,4 @@ pub mod hll; pub mod tdigest; pub mod theta; -mod codec; mod hash; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
