etseidl commented on code in PR #8587:
URL: https://github.com/apache/arrow-rs/pull/8587#discussion_r2423090284


##########
parquet/src/basic.rs:
##########
@@ -724,6 +725,123 @@ impl FromStr for Encoding {
     }
 }
 
+/// A bitmask representing the [`Encoding`]s employed while encoding a Parquet 
column chunk.
+///
+/// The Parquet [`ColumnMetaData`] struct contains an array that indicates 
what encodings were
+/// used when writing that column chunk. For memory and performance reasons, 
this crate reduces
+/// that array to bitmask, where each bit position represents a different 
[`Encoding`]. This
+/// struct contains that bitmask, and provides methods to interact with the 
data.
+///
+/// # Example
+/// ```no_run
+/// # use parquet::file::metadata::ParquetMetaDataReader;
+/// # use parquet::basic::Encoding;
+/// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); }
+/// // read parquet metadata from a file
+/// let file = open_parquet_file("some_path.parquet");
+/// let mut reader = ParquetMetaDataReader::new();
+/// reader.try_parse(&file).unwrap();
+/// let metadata = reader.finish().unwrap();
+///
+/// // find the encodings used by the first column chunk in the first row group
+/// let col_meta = metadata.row_group(0).column(0);
+/// let encodings = col_meta.encodings_mask();
+///
+/// // check to see if a particular encoding was used
+/// let used_rle = encodings.is_set(Encoding::RLE);
+///
+/// // check to see if all of a set of encodings were used
+/// let used_all = encodings.all_set([Encoding::RLE, Encoding::PLAIN].iter());
+///
+/// // convert mask to a Vec<Encoding>
+/// let encodings_vec = encodings.encodings().collect::<Vec<_>>();
+/// ```
+///
+/// [`ColumnMetaData`]: 
https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/src/main/thrift/parquet.thrift#L875
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct EncodingMask(i32);
+
+impl EncodingMask {
+    const MAX_ENCODING: i32 = Encoding::BYTE_STREAM_SPLIT as i32;
+
+    /// Create a new `EncodingMask` from an integer.
+    pub fn new(val: i32) -> Self {
+        Self(val)
+    }
+
+    /// Return an integer representation of this `EncodingMask`.
+    pub fn as_i32(&self) -> i32 {
+        self.0
+    }
+
+    /// Create a new `EncodingMask` from a collection of [`Encoding`]s.
+    pub fn new_from_encodings<'a>(encodings: impl Iterator<Item = &'a 
Encoding>) -> Self {
+        let mut mask = 0;
+        for &e in encodings {
+            mask |= 1 << (e as i32);
+        }
+        Self(mask)
+    }
+
+    /// Test if a given [`Encoding`] is present in this mask.
+    pub fn is_set(&self, val: Encoding) -> bool {
+        self.0 & (1 << (val as i32)) != 0
+    }
+
+    /// Test if all [`Encoding`]s in a given set are present in this mask.
+    pub fn all_set<'a>(&self, mut encodings: impl Iterator<Item = &'a 
Encoding>) -> bool {
+        encodings.all(|&e| self.is_set(e))
+    }
+
+    /// Return an iterator over all [`Encoding`]s present in this mask.
+    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
+        Self::mask_to_encodings_iter(self.0)
+    }
+
+    fn mask_to_encodings_iter(mask: i32) -> impl Iterator<Item = Encoding> {
+        (0..=Self::MAX_ENCODING)
+            .filter(move |i| mask & (1 << i) != 0)
+            .map(i32_to_encoding)
+    }
+}
+
+impl HeapSize for EncodingMask {
+    fn heap_size(&self) -> usize {
+        0 // no heap allocations
+    }
+}
+
+impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EncodingMask 
{
+    fn read_thrift(prot: &mut R) -> Result<Self> {
+        let mut mask = 0;
+
+        let list_ident = prot.read_list_begin()?;
+        for _ in 0..list_ident.size {
+            let val = i32::read_thrift(prot)?;
+            if (0..=Self::MAX_ENCODING).contains(&val) {

Review Comment:
   Oh, and I was wondering if `i32` was too large, but we're already up to 10 
bits with the current encodings, and with @alamb starting the discussion about 
adding more, I think we're likely to exhaust an `i16` in the not-too-distant 
future.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to