tustvold commented on a change in pull request #1041:
URL: https://github.com/apache/arrow-rs/pull/1041#discussion_r777130877



##########
File path: parquet/src/column/reader/decoder.rs
##########
@@ -0,0 +1,215 @@
+use std::collections::HashMap;
+use std::ops::Range;
+
+use crate::basic::Encoding;
+use crate::data_type::DataType;
+use crate::decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder};
+use crate::encodings::rle::RleDecoder;
+use crate::errors::{ParquetError, Result};
+use crate::memory::ByteBufferPtr;
+use crate::schema::types::ColumnDescPtr;
+use crate::util::bit_util::BitReader;
+
+/// A type that can have level data written to it by a [`ColumnLevelDecoder`]
+pub trait LevelsWriter {
+    fn capacity(&self) -> usize;
+
+    fn get(&self, idx: usize) -> i16;
+
+    fn count_nulls(&self, range: Range<usize>, max_level: i16) -> usize;
+}
+
+impl LevelsWriter for [i16] {
+    fn capacity(&self) -> usize {
+        self.len()
+    }
+
+    fn get(&self, idx: usize) -> i16 {
+        self[idx]
+    }
+
+    fn count_nulls(&self, range: Range<usize>, max_level: i16) -> usize {
+        self[range].iter().filter(|i| **i != max_level).count()
+    }
+}
+
+/// A type that can have value data written to it by a [`ColumnValueDecoder`]
+pub trait ValuesWriter {
+    fn capacity(&self) -> usize;
+}
+
+impl<T> ValuesWriter for [T] {
+    fn capacity(&self) -> usize {
+        self.len()
+    }
+}
+
+/// Decodes level data to a [`LevelsWriter`]
+pub trait ColumnLevelDecoder {
+    type Writer: LevelsWriter + ?Sized;
+
+    fn create(max_level: i16, encoding: Encoding, data: ByteBufferPtr) -> Self;
+
+    fn read(&mut self, out: &mut Self::Writer, range: Range<usize>) -> Result<usize>;
+}
+
+/// Decodes value data to a [`ValuesWriter`]
+pub trait ColumnValueDecoder {
+    type Writer: ValuesWriter + ?Sized;
+
+    fn create(col: &ColumnDescPtr) -> Self;
+
+    fn set_dict(
+        &mut self,
+        buf: ByteBufferPtr,
+        num_values: u32,
+        encoding: Encoding,
+        is_sorted: bool,
+    ) -> Result<()>;
+
+    fn set_data(
+        &mut self,
+        encoding: Encoding,
+        data: ByteBufferPtr,
+        num_values: usize,
+    ) -> Result<()>;
+
+    fn read(&mut self, out: &mut Self::Writer, range: Range<usize>) -> Result<usize>;
+}
+
+/// An implementation of [`ColumnValueDecoder`] for `[T::T]`
+pub struct ColumnValueDecoderImpl<T: DataType> {
+    descr: ColumnDescPtr,
+
+    current_encoding: Option<Encoding>,
+
+    // Cache of decoders for existing encodings
+    decoders: HashMap<Encoding, Box<dyn Decoder<T>>>,
+}
+
+impl<T: DataType> ColumnValueDecoder for ColumnValueDecoderImpl<T> {
+    type Writer = [T::T];
+
+    fn create(descr: &ColumnDescPtr) -> Self {
+        Self {
+            descr: descr.clone(),
+            current_encoding: None,
+            decoders: Default::default(),
+        }
+    }
+
+    fn set_dict(
+        &mut self,
+        buf: ByteBufferPtr,
+        num_values: u32,
+        mut encoding: Encoding,
+        _is_sorted: bool,
+    ) -> Result<()> {
+        if encoding == Encoding::PLAIN || encoding == Encoding::PLAIN_DICTIONARY {
+            encoding = Encoding::RLE_DICTIONARY
+        }
+
+        if self.decoders.contains_key(&encoding) {
+            return Err(general_err!("Column cannot have more than one dictionary"));
+        }
+
+        if encoding == Encoding::RLE_DICTIONARY {
+            let mut dictionary = PlainDecoder::<T>::new(self.descr.type_length());
+            dictionary.set_data(buf, num_values as usize)?;
+
+            let mut decoder = DictDecoder::new();
+            decoder.set_dict(Box::new(dictionary))?;
+            self.decoders.insert(encoding, Box::new(decoder));
+            Ok(())
+        } else {
+            Err(nyi_err!(
+                "Invalid/Unsupported encoding type for dictionary: {}",
+                encoding
+            ))
+        }
+    }
+
+    fn set_data(
+        &mut self,
+        mut encoding: Encoding,
+        data: ByteBufferPtr,
+        num_values: usize,
+    ) -> Result<()> {
+        if encoding == Encoding::PLAIN_DICTIONARY {
+            encoding = Encoding::RLE_DICTIONARY;
+        }
+
+        let decoder = if encoding == Encoding::RLE_DICTIONARY {
+            self.decoders
+                .get_mut(&encoding)
+                .expect("Decoder for dict should have been set")
+        } else {
+            // Search cache for data page decoder
+            #[allow(clippy::map_entry)]
+            if !self.decoders.contains_key(&encoding) {
+                // Initialize decoder for this page
+                let data_decoder = get_decoder::<T>(self.descr.clone(), encoding)?;
+                self.decoders.insert(encoding, data_decoder);
+            }
+            self.decoders.get_mut(&encoding).unwrap()
+        };
+
+        decoder.set_data(data, num_values)?;
+        self.current_encoding = Some(encoding);
+        Ok(())
+    }
+
+    fn read(&mut self, out: &mut Self::Writer, range: Range<usize>) -> Result<usize> {
+        let encoding = self
+            .current_encoding
+            .expect("current_encoding should be set");
+
+        let current_decoder = self

Review comment:
       I didn't write this logic, just moved it, but my guess is that this is a way to placate the borrow checker. `Decoder::get` requires a mutable reference, and we wish for decoders — in particular the dictionary decoder — to be usable across multiple `set_data` calls.
   
   In order to have a `current_decoder` construct, we would either need to perform a convoluted dance, moving data in and out of the decoder map, or use `Rc<RefCell<_>>`. This is simpler, if possibly a little less performant. FWIW, I'd wager that the overheads of a hashmap keyed on a low-cardinality enumeration are pretty low.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to