This is an automated email from the ASF dual-hosted git repository. sunchao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push: new b2de544 parquet: Speed up `BitReader`/`DeltaBitPackDecoder` (#325) b2de544 is described below commit b2de5446cc1e45a0559fb39039d0545df1ac0d26 Author: Kornelijus Survila <kornhol...@gmail.com> AuthorDate: Sun May 23 19:00:42 2021 -0600 parquet: Speed up `BitReader`/`DeltaBitPackDecoder` (#325) * parquet: Avoid temporary `BufferPtr`s in `BitReader` From a quick test, this speeds up reading delta-packed int columns by over 30%. * parquet: Avoid some allocations in `DeltaBitPackDecoder` From a quick test, it seems to decode around 10% faster overall. --- parquet/src/encodings/decoding.rs | 7 +++---- parquet/src/util/bit_util.rs | 13 +++---------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index b73ebf0..e83e277 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -395,16 +395,15 @@ impl<T: DataType> DeltaBitPackDecoder<T> { .get_zigzag_vlq_int() .ok_or_else(|| eof_err!("Not enough data to decode 'min_delta'"))?; - let mut widths = vec![]; + self.delta_bit_widths.clear(); for _ in 0..self.num_mini_blocks { let w = self .bit_reader .get_aligned::<u8>(1) .ok_or_else(|| eof_err!("Not enough data to decode 'width'"))?; - widths.push(w); + self.delta_bit_widths.push(w); } - self.delta_bit_widths.set_data(widths); self.mini_block_idx = 0; self.delta_bit_width = self.delta_bit_widths.data()[0]; self.values_current_mini_block = self.values_per_mini_block; @@ -417,7 +416,6 @@ impl<T: DataType> DeltaBitPackDecoder<T> { where T::T: FromBytes, { - self.deltas_in_mini_block.clear(); if self.use_batch { self.deltas_in_mini_block .resize(self.values_current_mini_block, T::T::default()); @@ -427,6 +425,7 @@ impl<T: DataType> DeltaBitPackDecoder<T> { ); assert!(loaded == self.values_current_mini_block); } else { + self.deltas_in_mini_block.clear(); for _ in 0..self.values_current_mini_block { // TODO: load one batch at a time similar to int32 let delta = self diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 677b669..8dfb631 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -603,11 +603,7 @@ impl BitReader { // Advance byte_offset to next unread byte and read num_bytes self.byte_offset += bytes_read; - let v = read_num_bytes!( - T, - num_bytes, - self.buffer.start_from(self.byte_offset).as_ref() - ); + let v = read_num_bytes!(T, num_bytes, self.buffer.data()[self.byte_offset..]); self.byte_offset += num_bytes; // Reset buffered_values @@ -657,11 +653,8 @@ impl BitReader { fn reload_buffer_values(&mut self) { let bytes_to_read = cmp::min(self.total_bytes - self.byte_offset, 8); - self.buffered_values = read_num_bytes!( - u64, - bytes_to_read, - self.buffer.start_from(self.byte_offset).as_ref() - ); + self.buffered_values = + read_num_bytes!(u64, bytes_to_read, self.buffer.data()[self.byte_offset..]); } }