fresh-borzoni commented on code in PR #156: URL: https://github.com/apache/fluss-rust/pull/156#discussion_r2686336887
########## crates/fluss/src/util/varint.rs: ########## @@ -0,0 +1,446 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Variable-length integer encoding utilities. +//! +//! This module provides utilities for encoding integers in variable-length format, +//! which can save space when encoding small integers. The encoding uses 7 bits per byte +//! with the most significant bit as a continuation flag. + +use bytes::BufMut; +use std::io::{self, Read, Write}; + +/// Write an unsigned integer in variable-length format. +/// +/// The encoding uses 7 bits per byte with the MSB set to 1 if more bytes follow. +/// This matches the encoding used in Google Protocol Buffers. +pub fn write_unsigned_varint<W: Write>(value: u32, writer: &mut W) -> io::Result<usize> { + let mut v = value; + let mut bytes_written = 0; + + while (v & !0x7F) != 0 { + writer.write_all(&[((v as u8) & 0x7F) | 0x80])?; + bytes_written += 1; + v >>= 7; + } + writer.write_all(&[v as u8])?; + bytes_written += 1; + + Ok(bytes_written) +} + +/// Write an unsigned integer in variable-length format to a buffer. +pub fn write_unsigned_varint_buf(value: u32, buf: &mut impl BufMut) { + let mut v = value; + + while (v & !0x7F) != 0 { + buf.put_u8(((v as u8) & 0x7F) | 0x80); + v >>= 7; + } + buf.put_u8(v as u8); +} + +/// Read an unsigned integer stored in variable-length format. +#[allow(dead_code)] +pub fn read_unsigned_varint<R: Read>(reader: &mut R) -> io::Result<u32> { + let mut tmp = [0u8; 1]; + reader.read_exact(&mut tmp)?; + let mut byte = tmp[0] as i8; + + if byte >= 0 { + return Ok(byte as u32); + } + + let mut result = (byte & 127) as u32; + + reader.read_exact(&mut tmp)?; + byte = tmp[0] as i8; + if byte >= 0 { + result |= (byte as u32) << 7; + } else { + result |= ((byte & 127) as u32) << 7; + + reader.read_exact(&mut tmp)?; + byte = tmp[0] as i8; + if byte >= 0 { + result |= (byte as u32) << 14; + } else { + result |= ((byte & 127) as u32) << 14; + + reader.read_exact(&mut tmp)?; + byte = tmp[0] as i8; + if byte >= 0 { + result |= (byte as u32) << 21; + } else { + result |= ((byte & 127) as u32) << 21; + + reader.read_exact(&mut tmp)?; + byte = tmp[0] as i8; + result |= (byte as u32) << 28; + + if byte < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "VarInt is too long, the most significant bit in the 5th byte is set, converted value: {:#x}", + result + ), + )); + } + } + } + } + + Ok(result) +} + +/// Read an unsigned integer from a byte slice in variable-length format. +pub fn read_unsigned_varint_bytes(bytes: &[u8]) -> io::Result<(u32, usize)> { + if bytes.is_empty() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Cannot read varint from empty buffer", + )); + } + + let mut byte = bytes[0] as i8; + let mut index = 1; + + if byte >= 0 { + return Ok((byte as u32, index)); + } + + let mut result = (byte & 127) as u32; + + if index >= bytes.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Incomplete varint", + )); + } + byte = bytes[index] as i8; + index += 1; + if byte >= 0 { + result |= (byte as u32) << 7; + } else { + result |= ((byte & 127) as u32) << 7; + + if index >= bytes.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Incomplete varint", + )); + } + byte = bytes[index] as i8; + index += 1; + if byte >= 0 { + result |= (byte as u32) << 14; + } else { + result |= ((byte & 127) as u32) << 14; + + if index >= bytes.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Incomplete varint", + )); + } + byte = bytes[index] as i8; + index += 1; + if byte >= 0 { + result |= (byte as u32) << 21; + } else { + result |= ((byte & 127) as u32) << 21; + + if index >= bytes.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Incomplete varint", + )); + } + byte = bytes[index] as i8; + index += 1; + result |= (byte as u32) << 28; + + if byte < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "VarInt is too long, the most significant bit in the 5th byte is set, converted value: {:#x}", + result + ), + )); + } + } + } + } + + Ok((result, index)) +} + +/// Calculate the number of bytes needed to encode an unsigned integer in variable-length format. +pub fn size_of_unsigned_varint(value: u32) -> usize { + let leading_zeros = value.leading_zeros(); + let leading_zeros_below_38_divided_by_7 = ((38 - leading_zeros) * 0b10010010010010011) >> 19; + (leading_zeros_below_38_divided_by_7 + (leading_zeros >> 5)) as usize +} + +/// Write an unsigned 64-bit integer in variable-length format to a buffer. +#[allow(dead_code)] +pub fn write_unsigned_varint_u64_buf(value: u64, buf: &mut impl BufMut) { + let mut v = value; + while (v & !0x7F) != 0 { + buf.put_u8(((v as u8) & 0x7F) | 0x80); + v >>= 7; + } + buf.put_u8(v as u8); +} + +/// Write directly to a mutable byte slice, returning the number of bytes written. +/// Used by CompactedRowWriter which manages its own position. +pub fn write_unsigned_varint_to_slice(value: u32, slice: &mut [u8]) -> usize { + let mut v = value; + let mut written = 0; + + while (v & !0x7F) != 0 { + slice[written] = ((v as u8) & 0x7F) | 0x80; + written += 1; + v >>= 7; + } + slice[written] = v as u8; + written + 1 +} + +/// Write unsigned 64-bit varint directly to a mutable byte slice. +pub fn write_unsigned_varint_u64_to_slice(value: u64, slice: &mut [u8]) -> usize { + let mut v = value; + let mut written = 0; + + while (v & !0x7F) != 0 { + slice[written] = ((v as u8) & 0x7F) | 0x80; + written += 1; + v >>= 7; + } + slice[written] = v as u8; + written + 1 +} Review Comment: it's on hot path, so better to leave as it is and don't trade performance here -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
