tustvold commented on a change in pull request #1283:
URL: https://github.com/apache/arrow-rs/pull/1283#discussion_r800565191
##########
File path: parquet/src/util/bit_util.rs
##########
@@ -445,158 +445,307 @@ impl BitWriter {
/// MAX_VLQ_BYTE_LEN = 5 for i32, and MAX_VLQ_BYTE_LEN = 10 for i64
pub const MAX_VLQ_BYTE_LEN: usize = 10;
-pub struct BitReader {
- // The byte buffer to read from, passed in by client
+/// A struct storing the state for reading individual bits from a byte array
+struct BitReaderUnaligned {
+ /// The byte buffer to read from, passed in by client
buffer: ByteBufferPtr,
- // Bytes are memcpy'd from `buffer` and values are read from this variable.
- // This is faster than reading values byte by byte directly from `buffer`
- buffered_values: u64,
-
- //
- // End Start
- // |............|B|B|B|B|B|B|B|B|..............|
- // ^ ^
- // bit_offset byte_offset
- //
- // Current byte offset in `buffer`
+ ///
+ /// End Start
+ /// |............|B|B|B|B|B|B|B|B|..............|
+ /// ^ ^
+ /// bit_offset byte_offset
+ ///
+ /// Current byte offset in `buffer`
byte_offset: usize,
- // Current bit offset in `buffered_values`
+ /// Current bit offset in `buffered_values`
bit_offset: usize,
- // Total number of bytes in `buffer`
- total_bytes: usize,
+ /// Bytes are memcpy'd from `buffer` and values are read from this variable.
+ /// This is faster than reading values byte by byte directly from `buffer`
+ buffered_values: u64,
}
-/// Utility class to read bit/byte stream. This class can read bits or bytes
that are
-/// either byte aligned or not.
-impl BitReader {
- pub fn new(buffer: ByteBufferPtr) -> Self {
- let total_bytes = buffer.len();
- let num_bytes = cmp::min(8, total_bytes);
- let buffered_values = read_num_bytes!(u64, num_bytes, buffer.as_ref());
- BitReader {
- buffer,
- buffered_values,
- byte_offset: 0,
- bit_offset: 0,
- total_bytes,
- }
- }
-
- pub fn reset(&mut self, buffer: ByteBufferPtr) {
- self.buffer = buffer;
- self.total_bytes = self.buffer.len();
- let num_bytes = cmp::min(8, self.total_bytes);
- self.buffered_values = read_num_bytes!(u64, num_bytes,
self.buffer.as_ref());
- self.byte_offset = 0;
- self.bit_offset = 0;
- }
-
- /// Gets the current byte offset
- #[inline]
- pub fn get_byte_offset(&self) -> usize {
- self.byte_offset + ceil(self.bit_offset as i64, 8) as usize
+impl BitReaderUnaligned {
+ fn reload_buffer_values(&mut self) {
+ let bytes_to_read = cmp::min(self.buffer.len() - self.byte_offset, 8);
+ self.buffered_values =
+ read_num_bytes!(u64, bytes_to_read,
self.buffer.data()[self.byte_offset..]);
}
- /// Reads a value of type `T` and of size `num_bits`.
- ///
- /// Returns `None` if there's not enough data available. `Some` otherwise.
- pub fn get_value<T: FromBytes>(&mut self, num_bits: usize) -> Option<T> {
- assert!(num_bits <= 64);
- assert!(num_bits <= size_of::<T>() * 8);
+ fn get<T: FromBytes>(&mut self, bit_width: usize) -> Option<T> {
+ assert!(bit_width <= 64);
+ assert!(bit_width <= size_of::<T>() * 8);
- if self.byte_offset * 8 + self.bit_offset + num_bits >
self.total_bytes * 8 {
+ if self.byte_offset * 8 + self.bit_offset + bit_width >
self.buffer.len() * 8 {
return None;
}
- let mut v = trailing_bits(self.buffered_values, self.bit_offset +
num_bits)
+ let mut v = trailing_bits(self.buffered_values, self.bit_offset +
bit_width)
>> self.bit_offset;
- self.bit_offset += num_bits;
+ self.bit_offset += bit_width;
if self.bit_offset >= 64 {
self.byte_offset += 8;
self.bit_offset -= 64;
self.reload_buffer_values();
v |= trailing_bits(self.buffered_values, self.bit_offset)
- .wrapping_shl((num_bits - self.bit_offset) as u32);
+ .wrapping_shl((bit_width - self.bit_offset) as u32);
}
// TODO: better to avoid copying here
Some(from_ne_slice(v.as_bytes()))
}
- pub fn get_batch<T: FromBytes>(&mut self, batch: &mut [T], num_bits:
usize) -> usize {
- assert!(num_bits <= 32);
- assert!(num_bits <= size_of::<T>() * 8);
+ /// Gets the current byte offset
+ fn aligned_byte_offset(&self) -> usize {
+ self.byte_offset + ceil(self.bit_offset as i64, 8) as usize
+ }
+}
- let mut values_to_read = batch.len();
- let needed_bits = num_bits * values_to_read;
- let remaining_bits = (self.total_bytes - self.byte_offset) * 8 -
self.bit_offset;
- if remaining_bits < needed_bits {
- values_to_read = remaining_bits / num_bits;
+/// A struct for storing the state for reading whole bytes from a byte stream
+struct BitReaderAligned {
+ /// The byte buffer to read from, passed in by client
+ buffer: ByteBufferPtr,
+ /// The current offset in `buffer`
+ byte_offset: usize,
+}
+
+impl BitReaderAligned {
+ fn get<T: FromBytes>(&mut self, num_bytes: usize) -> Option<T> {
+ if self.byte_offset + num_bytes > self.buffer.len() {
+ return None;
}
- let mut i = 0;
+ let v = read_num_bytes!(T, num_bytes,
self.buffer.data()[self.byte_offset..]);
+ self.byte_offset += num_bytes;
- // First align bit offset to byte offset
- if self.bit_offset != 0 {
- while i < values_to_read && self.bit_offset != 0 {
- batch[i] = self
- .get_value(num_bits)
- .expect("expected to have more data");
- i += 1;
- }
- }
+ Some(v)
+ }
- unsafe {
- let in_buf = &self.buffer.data()[self.byte_offset..];
- let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32;
- if size_of::<T>() == 4 {
- while values_to_read - i >= 32 {
- let out_ptr = &mut batch[i..] as *mut [T] as *mut T as
*mut u32;
- in_ptr = unpack32(in_ptr, out_ptr, num_bits);
- self.byte_offset += 4 * num_bits;
- i += 32;
- }
- } else {
- let mut out_buf = [0u32; 32];
- let out_ptr = &mut out_buf as &mut [u32] as *mut [u32] as *mut
u32;
- while values_to_read - i >= 32 {
- in_ptr = unpack32(in_ptr, out_ptr, num_bits);
- self.byte_offset += 4 * num_bits;
- for n in 0..32 {
- // We need to copy from smaller size to bigger size to
avoid
- // overwriting other memory regions.
- if size_of::<T>() > size_of::<u32>() {
+ /// Read up to `to_read` values from a packed buffer `batch` with bit width `num_bits`
+ /// in batches of 32, returning the number of values read
+ ///
+ /// # Panics
+ ///
+ /// This function panics if
+ /// * `bit_width` is greater than 32
+ /// * less than `to_read` values in the buffer
+ fn get_batch_x32<T: FromBytes>(
+ &mut self,
+ batch: &mut [T],
+ to_read: usize,
+ bit_width: usize,
+ ) -> usize {
+ assert!(bit_width <= 32);
+
+ let mut values_read = 0;
+ let in_buf = &self.buffer.data()[self.byte_offset..];
+ assert!(in_buf.len() * 8 >= to_read * bit_width);
+
+ let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32;
Review comment:
This block is simply moved, with some slightly more verbose naming.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]