alamb commented on code in PR #8607:
URL: https://github.com/apache/arrow-rs/pull/8607#discussion_r2441096924
##########
parquet/src/arrow/record_reader/definition_levels.rs:
##########
@@ -284,20 +280,31 @@ impl PackedDecoder {
self.data_offset = 0;
}
- fn read(&mut self, buffer: &mut BooleanBufferBuilder, len: usize) -> Result<usize> {
- let mut read = 0;
- while read != len {
+ /// Reads up to `len` definition levels directly into a boolean bitmask.
+ ///
+ /// Returns a tuple of `(values_read, levels_read)`, where `values_read` counts the
+ /// number of `true` bits appended to `buffer`.
+ fn read(&mut self, buffer: &mut BooleanBufferBuilder, len: usize) -> Result<(usize, usize)> {
Review Comment:
@jhorstmann or @etseidl -- do you have any time to review this portion of
the change more carefully?
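For anyone picking this up, here is a standalone sketch of the `(values_read, levels_read)` contract described in the new doc comment. The helper below is illustrative only (it is not the `PackedDecoder` logic from this PR) and assumes a column with max definition level 1, where level 1 means the value is present:

```rust
// Minimal illustration of the (values_read, levels_read) return contract.
use arrow_buffer::BooleanBufferBuilder;

/// Hypothetical helper: turn a slice of definition levels (max level = 1)
/// into a validity bitmask, returning `(values_read, levels_read)`.
fn read_levels_into_mask(levels: &[i16], buffer: &mut BooleanBufferBuilder) -> (usize, usize) {
    let mut values_read = 0;
    for &level in levels {
        let is_valid = level == 1; // level 1 => value present, level 0 => null
        buffer.append(is_valid);
        values_read += usize::from(is_valid);
    }
    (values_read, levels.len())
}

fn main() {
    // Definition levels for 6 slots, two of which are null.
    let levels = [1i16, 1, 0, 1, 0, 1];
    let mut mask = BooleanBufferBuilder::new(levels.len());

    let (values_read, levels_read) = read_levels_into_mask(&levels, &mut mask);
    assert_eq!(levels_read, 6); // every level was consumed
    assert_eq!(values_read, 4); // only the `true` bits count as values

    let mask = mask.finish();
    assert_eq!(mask.count_set_bits(), values_read);
}
```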
##########
parquet/src/column/reader/decoder.rs:
##########
@@ -136,14 +134,75 @@ pub trait ColumnValueDecoder {
fn skip_values(&mut self, num_values: usize) -> Result<usize>;
}
+/// Bucket-based storage for decoder instances keyed by `Encoding`.
+///
+/// This replaces `HashMap` lookups with direct indexing to avoid hashing overhead in the
+/// hot decoding paths.
+const ENCODING_SLOTS: usize = 10; // covers the encodings handled in `enc_slot`
+
+#[inline]
+fn enc_slot(e: Encoding) -> usize {
+ match e {
+ Encoding::PLAIN => 0,
+ Encoding::PLAIN_DICTIONARY => 2,
+ Encoding::RLE => 3,
+ #[allow(deprecated)]
+ Encoding::BIT_PACKED => 4,
+ Encoding::DELTA_BINARY_PACKED => 5,
+ Encoding::DELTA_LENGTH_BYTE_ARRAY => 6,
+ Encoding::DELTA_BYTE_ARRAY => 7,
+ Encoding::RLE_DICTIONARY => 8,
+ Encoding::BYTE_STREAM_SPLIT => 9,
+ }
+}
+
+/// Fixed-capacity storage for decoder instances keyed by Parquet encoding.
+struct DecoderBuckets<V> {
+ inner: [Option<V>; ENCODING_SLOTS],
+}
+
+impl<V> DecoderBuckets<V> {
+ #[inline]
+ fn new() -> Self {
+ Self {
+ inner: std::array::from_fn(|_| None),
+ }
+ }
+
+ #[inline]
+ fn contains_key(&self, e: Encoding) -> bool {
+ self.inner[enc_slot(e)].is_some()
+ }
+
+ #[inline]
+ fn get_mut(&mut self, e: Encoding) -> Option<&mut V> {
+ self.inner[enc_slot(e)].as_mut()
+ }
+
+ #[inline]
+ fn insert_and_get_mut(&mut self, e: Encoding, v: V) -> &mut V {
+ let slot = &mut self.inner[enc_slot(e)];
+ debug_assert!(slot.is_none());
+ *slot = Some(v);
+ slot.as_mut().unwrap()
+ }
+}
+
+impl<V> Default for DecoderBuckets<V> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
/// An implementation of [`ColumnValueDecoder`] for `[T::T]`
pub struct ColumnValueDecoderImpl<T: DataType> {
descr: ColumnDescPtr,
current_encoding: Option<Encoding>,
- // Cache of decoders for existing encodings
- decoders: HashMap<Encoding, Box<dyn Decoder<T>>>,
+ /// Cache of decoders for existing encodings.
Review Comment:
this is a nice improvement
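For context on why: the old code hashed the `Encoding` on every lookup via `HashMap`, while the buckets index a fixed array. Here is a rough sketch of the hot-path lookup, assuming the `DecoderBuckets` and `enc_slot` definitions quoted above are in scope (the PR's actual `set_data` wiring may differ, and decoder construction / error handling are elided):

```rust
// Sketch only: how a set_data-style lookup might use the buckets.
fn cached_decoder<'a, V>(
    decoders: &'a mut DecoderBuckets<V>,
    encoding: Encoding,
    create: impl FnOnce() -> V,
) -> &'a mut V {
    if decoders.contains_key(encoding) {
        // Hot path: a direct array index via `enc_slot`, no hashing of `Encoding`.
        decoders.get_mut(encoding).unwrap()
    } else {
        // Cold path: the first page seen with this encoding creates and caches the decoder.
        decoders.insert_and_get_mut(encoding, create())
    }
}
```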
##########
parquet/src/arrow/buffer/bit_util.rs:
##########
@@ -18,6 +18,7 @@
use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk;
use std::ops::Range;
+#[allow(unused)]
Review Comment:
If this is no longer used, I think we should remove it (rather than marking it `#[allow(unused)]`).
The fact that it is `pub` is misleading, I think, as it isn't publicly exported.
##########
parquet/src/column/reader/decoder.rs:
##########
@@ -136,14 +134,75 @@ pub trait ColumnValueDecoder {
fn skip_values(&mut self, num_values: usize) -> Result<usize>;
}
+/// Bucket-based storage for decoder instances keyed by `Encoding`.
+///
+/// This replaces `HashMap` lookups with direct indexing to avoid hashing overhead in the
+/// hot decoding paths.
+const ENCODING_SLOTS: usize = 10; // covers the encodings handled in `enc_slot`
+
+#[inline]
+fn enc_slot(e: Encoding) -> usize {
Review Comment:
FYI @etseidl -- this is very similar to what you did with the `Encoding` in metadata (with `EncodingMask`):
https://arrow.apache.org/rust/parquet/basic/struct.EncodingMask.html
@hhhizzz maybe we could use `EncodingMask` here
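To sketch the connection: the shared idea is that each `Encoding` maps to a fixed bit/slot position, so a membership check becomes a single mask test. The type and method names below are invented for illustration and are NOT the actual `EncodingMask` API (see the linked docs for that); it assumes the `enc_slot` function from the diff above is in scope:

```rust
// Illustrative only: membership tracked as a bitmask keyed by enc_slot.
#[derive(Default, Clone, Copy)]
struct SlotMask(u16);

impl SlotMask {
    fn insert(&mut self, e: Encoding) {
        self.0 |= 1u16 << enc_slot(e);
    }

    fn contains(&self, e: Encoding) -> bool {
        (self.0 & (1u16 << enc_slot(e))) != 0
    }
}
```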
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]