alamb commented on code in PR #2231:
URL: https://github.com/apache/arrow-rs/pull/2231#discussion_r933969991
##########
parquet/src/encodings/encoding/mod.rs:
##########
@@ -220,25 +223,18 @@ impl<T: DataType> Encoder<T> for RleValueEncoder<T> {
ensure_phys_ty!(Type::BOOLEAN, "RleValueEncoder only supports
BoolType");
let rle_encoder = self
.encoder
- .as_mut()
+ .take()
.expect("RLE value encoder is not initialized");
// Flush all encoder buffers and raw values
- let encoded_data = {
- let buf = rle_encoder.flush_buffer()?;
-
- // Note that buf does not have any offset, all data is encoded
bytes
- let len = (buf.len() as i32).to_le();
- let len_bytes = len.as_bytes();
- let mut encoded_data = vec![];
- encoded_data.extend_from_slice(len_bytes);
Review Comment:
Is this part of the speed increase -- avoid copying encoded bytes and
instead reserve the space for the length up front and update it at the end?
##########
parquet/src/encodings/encoding/dict_encoder.rs:
##########
@@ -127,12 +126,11 @@ impl<T: DataType> DictEncoder<T> {
/// the result.
pub fn write_indices(&mut self) -> Result<ByteBufferPtr> {
let buffer_len = self.estimated_data_encoded_size();
- let mut buffer = vec![0; buffer_len];
- buffer[0] = self.bit_width() as u8;
+ let mut buffer = Vec::with_capacity(buffer_len);
Review Comment:
👍
##########
parquet/src/encodings/levels.rs:
##########
@@ -65,32 +65,32 @@ impl LevelEncoder {
/// Used to encode levels for Data Page v1.
///
/// Panics, if encoding is not supported.
- pub fn v1(encoding: Encoding, max_level: i16, byte_buffer: Vec<u8>) ->
Self {
+ pub fn v1(encoding: Encoding, max_level: i16, capacity: usize) -> Self {
+ let capacity_bytes = max_buffer_size(encoding, max_level, capacity);
+ let mut buffer = Vec::with_capacity(capacity_bytes);
let bit_width = num_required_bits(max_level as u64);
match encoding {
- Encoding::RLE => LevelEncoder::Rle(RleEncoder::new_from_buf(
- bit_width,
- byte_buffer,
- mem::size_of::<i32>(),
- )),
+ Encoding::RLE => {
+ buffer.extend_from_slice(&[0; 8]);
Review Comment:
```suggestion
// reserve space for length
buffer.extend_from_slice(&[0; 8]);
```
Is that what this initial 0 is for?
##########
parquet/src/util/bit_util.rs:
##########
@@ -241,86 +213,62 @@ impl BitWriter {
#[inline]
pub fn flush_buffer(&mut self) -> &[u8] {
self.flush();
- &self.buffer()[0..self.byte_offset]
+ self.buffer()
}
/// Clears the internal state so the buffer can be reused.
#[inline]
pub fn clear(&mut self) {
+ self.buffer.clear();
self.buffered_values = 0;
- self.byte_offset = self.start;
self.bit_offset = 0;
}
/// Flushes the internal buffered bits and the align the buffer to the
next byte.
#[inline]
pub fn flush(&mut self) {
- let num_bytes = ceil(self.bit_offset as i64, 8) as usize;
- assert!(self.byte_offset + num_bytes <= self.max_bytes);
- memcpy_value(
- &self.buffered_values,
- num_bytes,
- &mut self.buffer[self.byte_offset..],
- );
+ let num_bytes = ceil(self.bit_offset, 8);
+ let slice = &self.buffered_values.to_ne_bytes()[..num_bytes as usize];
Review Comment:
I double checked that `to_ne_bytes()` does the same as `memcpy_value` in
terms of byte order. I think it does
##########
parquet/src/encodings/levels.rs:
##########
@@ -391,33 +386,6 @@ mod tests {
assert_eq!(buffer[0..num_decoded], levels[0..num_decoded]);
}
- // Tests when encoded values are larger than encoder's buffer
Review Comment:
Why are these tests removed? Is it because there is no way to "run out of
space" anymore?
##########
parquet/src/util/bit_util.rs:
##########
@@ -973,24 +887,23 @@ mod tests {
#[test]
fn test_skip() {
let mut writer = BitWriter::new(5);
- let old_offset = writer.skip(1).expect("skip() should return OK");
+ let old_offset = writer.skip(1);
writer.put_aligned(42, 4);
writer.put_aligned_offset(0x10, 1, old_offset);
let result = writer.consume();
assert_eq!(result.as_ref(), [0x10, 42, 0, 0, 0]);
writer = BitWriter::new(4);
let result = writer.skip(5);
- assert!(result.is_err());
+ assert_eq!(result, 0);
Review Comment:
this is ok now because the buffer automatically grows, right?
##########
parquet/src/util/bit_util.rs:
##########
@@ -846,16 +760,16 @@ mod tests {
fn test_bit_reader_skip() {
let buffer = vec![255, 0];
let mut bit_reader = BitReader::from(buffer);
- let skipped = bit_reader.skip(1,1);
+ let skipped = bit_reader.skip(1, 1);
Review Comment:
it is strange that `cargo fmt` seems to have started caring about this file
🤷
##########
parquet/src/util/bit_util.rs:
##########
@@ -330,103 +278,71 @@ impl BitWriter {
/// Writes the `num_bits` LSB of value `v` to the internal buffer of this
writer.
/// The `num_bits` must not be greater than 64. This is bit packed.
- ///
- /// Returns false if there's not enough room left. True otherwise.
#[inline]
- pub fn put_value(&mut self, v: u64, num_bits: usize) -> bool {
+ pub fn put_value(&mut self, v: u64, num_bits: usize) {
assert!(num_bits <= 64);
+ let num_bits = num_bits as u8;
assert_eq!(v.checked_shr(num_bits as u32).unwrap_or(0), 0); // covers
case v >> 64
- if self.byte_offset * 8 + self.bit_offset + num_bits > self.max_bytes
as usize * 8
- {
- return false;
- }
-
+ // Add value to buffered_values
self.buffered_values |= v << self.bit_offset;
self.bit_offset += num_bits;
- if self.bit_offset >= 64 {
- memcpy_value(
- &self.buffered_values,
- 8,
- &mut self.buffer[self.byte_offset..],
- );
- self.byte_offset += 8;
- self.bit_offset -= 64;
- self.buffered_values = 0;
+ if let Some(remaining) = self.bit_offset.checked_sub(64) {
+ self.buffer
+ .extend_from_slice(&self.buffered_values.to_le_bytes());
Review Comment:
Why does this path use `to_le_bytes` and the path above use `to_ne_bytes`?
Maybe they should all be `to_le_bytes` for consistency (and portability?)
##########
parquet/src/encodings/levels.rs:
##########
@@ -114,9 +114,7 @@ impl LevelEncoder {
}
LevelEncoder::BitPacked(bit_width, ref mut encoder) => {
for value in buffer {
- if !encoder.put_value(*value as u64, bit_width as usize) {
- return Err(general_err!("Not enough bytes left"));
- }
+ encoder.put_value(*value as u64, bit_width as usize);
Review Comment:
🤔 I wonder if removing an error check off the hot path also increases
performance
##########
parquet/src/encodings/encoding/mod.rs:
##########
@@ -873,7 +869,7 @@ mod tests {
let mut values = vec![];
values.extend_from_slice(&[true; 16]);
values.extend_from_slice(&[false; 16]);
- run_test::<BoolType>(Encoding::RLE, -1, &values, 0, 2, 0);
+ run_test::<BoolType>(Encoding::RLE, -1, &values, 0, 6, 0);
Review Comment:
why did this change from `2` to `6`?
##########
parquet/src/encodings/encoding/mod.rs:
##########
@@ -186,10 +186,13 @@ impl<T: DataType> Encoder<T> for RleValueEncoder<T> {
fn put(&mut self, values: &[T::T]) -> Result<()> {
ensure_phys_ty!(Type::BOOLEAN, "RleValueEncoder only supports
BoolType");
- if self.encoder.is_none() {
- self.encoder = Some(RleEncoder::new(1, DEFAULT_RLE_BUFFER_LEN));
- }
- let rle_encoder = self.encoder.as_mut().unwrap();
+ let rle_encoder = self.encoder.get_or_insert_with(|| {
Review Comment:
TIL:
https://doc.rust-lang.org/std/option/enum.Option.html#method.get_or_insert_with
👍
##########
parquet/src/util/bit_util.rs:
##########
@@ -180,59 +179,32 @@ pub fn get_bit(data: &[u8], i: usize) -> bool {
/// bit packed or byte aligned fashion.
pub struct BitWriter {
buffer: Vec<u8>,
- max_bytes: usize,
buffered_values: u64,
- byte_offset: usize,
- bit_offset: usize,
- start: usize,
+ bit_offset: u8,
}
impl BitWriter {
pub fn new(max_bytes: usize) -> Self {
Self {
- buffer: vec![0; max_bytes],
- max_bytes,
+ buffer: Vec::with_capacity(max_bytes),
Review Comment:
Also, as I reviewed the rest of this PR, it may mean that the hot path has
at least one fewer error check
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]