This is an automated email from the ASF dual-hosted git repository.

sunchao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new b2de544  parquet: Speed up `BitReader`/`DeltaBitPackDecoder` (#325)
b2de544 is described below

commit b2de5446cc1e45a0559fb39039d0545df1ac0d26
Author: Kornelijus Survila <kornhol...@gmail.com>
AuthorDate: Sun May 23 19:00:42 2021 -0600

    parquet: Speed up `BitReader`/`DeltaBitPackDecoder` (#325)
    
    * parquet: Avoid temporary `BufferPtr`s in `BitReader`
    
    From a quick test, this speeds up reading delta-packed int columns by
    over 30%.
    
    * parquet: Avoid some allocations in `DeltaBitPackDecoder`
    
    From a quick test, it seems to decode around 10% faster overall.
---
 parquet/src/encodings/decoding.rs |  7 +++----
 parquet/src/util/bit_util.rs      | 13 +++----------
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs
index b73ebf0..e83e277 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -395,16 +395,15 @@ impl<T: DataType> DeltaBitPackDecoder<T> {
             .get_zigzag_vlq_int()
             .ok_or_else(|| eof_err!("Not enough data to decode 'min_delta'"))?;
 
-        let mut widths = vec![];
+        self.delta_bit_widths.clear();
         for _ in 0..self.num_mini_blocks {
             let w = self
                 .bit_reader
                 .get_aligned::<u8>(1)
                 .ok_or_else(|| eof_err!("Not enough data to decode 'width'"))?;
-            widths.push(w);
+            self.delta_bit_widths.push(w);
         }
 
-        self.delta_bit_widths.set_data(widths);
         self.mini_block_idx = 0;
         self.delta_bit_width = self.delta_bit_widths.data()[0];
         self.values_current_mini_block = self.values_per_mini_block;
@@ -417,7 +416,6 @@ impl<T: DataType> DeltaBitPackDecoder<T> {
     where
         T::T: FromBytes,
     {
-        self.deltas_in_mini_block.clear();
         if self.use_batch {
             self.deltas_in_mini_block
                 .resize(self.values_current_mini_block, T::T::default());
@@ -427,6 +425,7 @@ impl<T: DataType> DeltaBitPackDecoder<T> {
             );
             assert!(loaded == self.values_current_mini_block);
         } else {
+            self.deltas_in_mini_block.clear();
             for _ in 0..self.values_current_mini_block {
                 // TODO: load one batch at a time similar to int32
                 let delta = self
diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs
index 677b669..8dfb631 100644
--- a/parquet/src/util/bit_util.rs
+++ b/parquet/src/util/bit_util.rs
@@ -603,11 +603,7 @@ impl BitReader {
 
         // Advance byte_offset to next unread byte and read num_bytes
         self.byte_offset += bytes_read;
-        let v = read_num_bytes!(
-            T,
-            num_bytes,
-            self.buffer.start_from(self.byte_offset).as_ref()
-        );
+        let v = read_num_bytes!(T, num_bytes, self.buffer.data()[self.byte_offset..]);
         self.byte_offset += num_bytes;
 
         // Reset buffered_values
@@ -657,11 +653,8 @@ impl BitReader {
 
     fn reload_buffer_values(&mut self) {
         let bytes_to_read = cmp::min(self.total_bytes - self.byte_offset, 8);
-        self.buffered_values = read_num_bytes!(
-            u64,
-            bytes_to_read,
-            self.buffer.start_from(self.byte_offset).as_ref()
-        );
+        self.buffered_values =
+            read_num_bytes!(u64, bytes_to_read, self.buffer.data()[self.byte_offset..]);
     }
 }
 

Reply via email to