alamb commented on code in PR #8797:
URL: https://github.com/apache/arrow-rs/pull/8797#discussion_r2538004656


##########
parquet/src/file/metadata/options.rs:
##########
@@ -29,6 +74,11 @@ use crate::schema::types::SchemaDescPtr;
 #[derive(Default, Debug, Clone)]
 pub struct ParquetMetaDataOptions {
     schema_descr: Option<SchemaDescPtr>,
+    encoding_stats_as_mask: bool,
+    // The outer option acts as a global boolean, so if 
`skip_encoding_stats.is_some()`
+    // is `true` then we're at least skipping some stats. The inner `Option` 
is a keep
+    // list of column indices to decode.
+    skip_encoding_stats: Option<Option<Arc<HashSet<usize>>>>,

Review Comment:
   
   Would it make sense to update this API to also use `ParquetStatisticsPolicy`?
   
   ```suggestion
       skip_encoding_stats: ParquetStatisticsPolicy,
   ```



##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -557,6 +557,29 @@ impl ArrowReaderOptions {
         self
     }
 
+    /// Set whether to convert the [`encoding_stats`] in the Parquet 
`ColumnMetaData` to a bitmask.

Review Comment:
   ```suggestion
       /// Set whether to convert the [`encoding_stats`] in the Parquet 
`ColumnMetaData` to a bitmask (defaults to false)
   ```



##########
parquet/src/file/metadata/options.rs:
##########
@@ -17,12 +17,53 @@
 
 //! Options used to control metadata parsing
 
-use paste::paste;
 use std::collections::HashSet;
 use std::sync::Arc;
 
 use crate::schema::types::SchemaDescPtr;
 
+/// Enum to control decoding of some Parquet statistics fields.

Review Comment:
   yes, i agree this is much better



##########
parquet/src/file/metadata/options.rs:
##########
@@ -17,8 +17,53 @@
 
 //! Options used to control metadata parsing
 
+use std::collections::HashSet;
+use std::sync::Arc;
+
 use crate::schema::types::SchemaDescPtr;
 
+/// Enum to control decoding of some Parquet statistics fields.
+///
+/// # Example
+/// ```rust
+/// use parquet::file::metadata::ParquetStatisticsPolicy;
+/// use parquet::file::serialized_reader::ReadOptionsBuilder;
+/// use parquet::arrow::arrow_reader::ArrowReaderOptions;
+///
+/// // Set arrow options to skip encoding statistics for all columns.
+/// let options =
+///     
ArrowReaderOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll);
+///
+/// // Set serialized reader options to decode encoding statistics for all 
columns.
+/// let options =
+///     
ReadOptionsBuilder::new().with_encoding_stats_policy(ParquetStatisticsPolicy::KeepAll)
+///     .build();
+///
+/// // Set arrow options to skip encoding statistics for all columns, but to 
decode statistics
+/// // for columns 0 and 1.
+/// let options = ArrowReaderOptions::new()
+///     .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0, 
1]));
+/// ```
+#[derive(Debug, Clone)]
+pub enum ParquetStatisticsPolicy {
+    /// Decode the relevant statistics for all columns.
+    KeepAll,
+    /// Skip decoding the relevant statistics for all columns.
+    SkipAll,
+    /// Skip decoding the relevant statistics for all columns not in the 
provided set
+    /// of column indices.
+    SkipExcept(HashSet<usize>),
+}
+
+impl ParquetStatisticsPolicy {
+    /// Create a `ParquetStatisticsPolicy` to skip all columns except those in 
`keep`.
+    pub fn skip_except(keep: &[usize]) -> Self {

Review Comment:
   This is a nice API



##########
parquet/src/file/metadata/options.rs:
##########
@@ -48,9 +98,79 @@ impl ParquetMetaDataOptions {
         self.schema_descr = Some(val);
     }
 
-    /// Provide a schema to use when decoding the metadata. Returns `Self` for 
chaining.
+    /// Call [`Self::set_schema`] and return `Self` for chaining.
     pub fn with_schema(mut self, val: SchemaDescPtr) -> Self {
-        self.schema_descr = Some(val);
+        self.set_schema(val);
+        self
+    }
+
+    /// Returns whether to present the [`encoding_stats`] field of the Parquet 
`ColumnMetaData`
+    /// as a bitmask.

Review Comment:
   ```suggestion
       /// as a bitmask (defaults to false).
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to