[GitHub] [arrow-rs] yordan-pavlov commented on a change in pull request #1041: Generify ColumnReaderImpl and RecordReader (#1040)

GitBox Thu, 23 Dec 2021 15:50:44 -0800


yordan-pavlov commented on a change in pull request #1041:
URL: https://github.com/apache/arrow-rs/pull/1041#discussion_r774814520




##########
File path: parquet/src/column/reader/decoder.rs
##########
@@ -0,0 +1,215 @@
+use std::collections::HashMap;
+use std::ops::Range;
+
+use crate::basic::Encoding;
+use crate::data_type::DataType;
+use crate::decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder};
+use crate::encodings::rle::RleDecoder;
+use crate::errors::{ParquetError, Result};
+use crate::memory::ByteBufferPtr;
+use crate::schema::types::ColumnDescPtr;
+use crate::util::bit_util::BitReader;
+
+/// A type that can have level data written to it by a [`ColumnLevelDecoder`]
+pub trait LevelsWriter {
+    fn capacity(&self) -> usize;
+
+    fn get(&self, idx: usize) -> i16;
+
+    fn count_nulls(&self, range: Range<usize>, max_level: i16) -> usize;
+}
+
+impl LevelsWriter for [i16] {
+    fn capacity(&self) -> usize {
+        self.len()
+    }
+
+    fn get(&self, idx: usize) -> i16 {
+        self[idx]
+    }
+
+    fn count_nulls(&self, range: Range<usize>, max_level: i16) -> usize {
+        self[range].iter().filter(|i| **i != max_level).count()
+    }
+}
+
+/// A type that can have value data written to it by a [`ColumnValueDecoder`]
+pub trait ValuesWriter {
+    fn capacity(&self) -> usize;
+}
+
+impl<T> ValuesWriter for [T] {
+    fn capacity(&self) -> usize {
+        self.len()
+    }
+}
+
+/// Decodes level data to a [`LevelsWriter`]
+pub trait ColumnLevelDecoder {
+    type Writer: LevelsWriter + ?Sized;
+
+    fn create(max_level: i16, encoding: Encoding, data: ByteBufferPtr) -> Self;
+
+    fn read(&mut self, out: &mut Self::Writer, range: Range<usize>) -> 
Result<usize>;
+}
+
+/// Decodes value data to a [`ValuesWriter`]
+pub trait ColumnValueDecoder {
+    type Writer: ValuesWriter + ?Sized;
+
+    fn create(col: &ColumnDescPtr) -> Self;
+
+    fn set_dict(
+        &mut self,
+        buf: ByteBufferPtr,
+        num_values: u32,
+        encoding: Encoding,
+        is_sorted: bool,
+    ) -> Result<()>;
+
+    fn set_data(
+        &mut self,
+        encoding: Encoding,
+        data: ByteBufferPtr,
+        num_values: usize,
+    ) -> Result<()>;
+
+    fn read(&mut self, out: &mut Self::Writer, range: Range<usize>) -> 
Result<usize>;
+}
+
+/// An implementation of [`ColumnValueDecoder`] for `[T::T]`
+pub struct ColumnValueDecoderImpl<T: DataType> {
+    descr: ColumnDescPtr,
+
+    current_encoding: Option<Encoding>,
+
+    // Cache of decoders for existing encodings
+    decoders: HashMap<Encoding, Box<dyn Decoder<T>>>,
+}
+
+impl<T: DataType> ColumnValueDecoder for ColumnValueDecoderImpl<T> {
+    type Writer = [T::T];
+
+    fn create(descr: &ColumnDescPtr) -> Self {
+        Self {
+            descr: descr.clone(),
+            current_encoding: None,
+            decoders: Default::default(),
+        }
+    }
+
+    fn set_dict(
+        &mut self,
+        buf: ByteBufferPtr,
+        num_values: u32,
+        mut encoding: Encoding,
+        _is_sorted: bool,
+    ) -> Result<()> {
+        if encoding == Encoding::PLAIN || encoding == 
Encoding::PLAIN_DICTIONARY {
+            encoding = Encoding::RLE_DICTIONARY
+        }
+
+        if self.decoders.contains_key(&encoding) {
+            return Err(general_err!("Column cannot have more than one 
dictionary"));
+        }
+
+        if encoding == Encoding::RLE_DICTIONARY {
+            let mut dictionary = 
PlainDecoder::<T>::new(self.descr.type_length());
+            dictionary.set_data(buf, num_values as usize)?;
+
+            let mut decoder = DictDecoder::new();
+            decoder.set_dict(Box::new(dictionary))?;
+            self.decoders.insert(encoding, Box::new(decoder));
+            Ok(())
+        } else {
+            Err(nyi_err!(
+                "Invalid/Unsupported encoding type for dictionary: {}",
+                encoding
+            ))
+        }
+    }
+
+    fn set_data(
+        &mut self,
+        mut encoding: Encoding,
+        data: ByteBufferPtr,
+        num_values: usize,
+    ) -> Result<()> {
+        if encoding == Encoding::PLAIN_DICTIONARY {
+            encoding = Encoding::RLE_DICTIONARY;
+        }
+
+        let decoder = if encoding == Encoding::RLE_DICTIONARY {
+            self.decoders
+                .get_mut(&encoding)
+                .expect("Decoder for dict should have been set")
+        } else {
+            // Search cache for data page decoder
+            #[allow(clippy::map_entry)]
+            if !self.decoders.contains_key(&encoding) {
+                // Initialize decoder for this page
+                let data_decoder = get_decoder::<T>(self.descr.clone(), 
encoding)?;
+                self.decoders.insert(encoding, data_decoder);
+            }
+            self.decoders.get_mut(&encoding).unwrap()
+        };
+
+        decoder.set_data(data, num_values)?;
+        self.current_encoding = Some(encoding);
+        Ok(())
+    }
+
+    fn read(&mut self, out: &mut Self::Writer, range: Range<usize>) -> 
Result<usize> {
+        let encoding = self
+            .current_encoding
+            .expect("current_encoding should be set");
+
+        let current_decoder = self

Review comment:
       why not set a `current_decoder` field in the `set_data` method (where 
the decoder has to be selected anyway to call `set_data` on it), so that it 
doesn't have to be looked up on every call of `read` here?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-rs] yordan-pavlov commented on a change in pull request #1041: Generify ColumnReaderImpl and RecordReader (#1040)

Reply via email to