etseidl commented on code in PR #8797:
URL: https://github.com/apache/arrow-rs/pull/8797#discussion_r2538938642
##########
parquet/src/file/metadata/options.rs:
##########
@@ -48,9 +98,79 @@ impl ParquetMetaDataOptions {
self.schema_descr = Some(val);
}
- /// Provide a schema to use when decoding the metadata. Returns `Self` for
chaining.
+ /// Call [`Self::set_schema`] and return `Self` for chaining.
pub fn with_schema(mut self, val: SchemaDescPtr) -> Self {
- self.schema_descr = Some(val);
+ self.set_schema(val);
+ self
+ }
+
+ /// Returns whether to present the [`encoding_stats`] field of the Parquet
`ColumnMetaData`
+ /// as a bitmask.
Review Comment:
Done in
https://github.com/apache/arrow-rs/pull/8797/commits/96753792588c4093f71ffe184ff3ad7acf8f3535
##########
parquet/src/file/metadata/options.rs:
##########
@@ -17,8 +17,53 @@
//! Options used to control metadata parsing
+use std::collections::HashSet;
+use std::sync::Arc;
+
use crate::schema::types::SchemaDescPtr;
+/// Enum to control decoding of some Parquet statistics fields.
+///
+/// # Example
+/// ```rust
+/// use parquet::file::metadata::ParquetStatisticsPolicy;
+/// use parquet::file::serialized_reader::ReadOptionsBuilder;
+/// use parquet::arrow::arrow_reader::ArrowReaderOptions;
+///
+/// // Set arrow options to skip encoding statistics for all columns.
+/// let options =
+///
ArrowReaderOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll);
+///
+/// // Set serialized reader options to decode encoding statistics for all
columns.
+/// let options =
+///
ReadOptionsBuilder::new().with_encoding_stats_policy(ParquetStatisticsPolicy::KeepAll)
+/// .build();
+///
+/// // Set arrow options to skip encoding statistics for all columns, but to
decode statistics
+/// // for columns 0 and 1.
+/// let options = ArrowReaderOptions::new()
+/// .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0,
1]));
+/// ```
+#[derive(Debug, Clone)]
+pub enum ParquetStatisticsPolicy {
+ /// Decode the relevant statistics for all columns.
+ KeepAll,
+ /// Skip decoding the relevant statistics for all columns.
+ SkipAll,
+ /// Skip decoding the relevant statistics for all columns not in the
provided set
+ /// of column indices.
+ SkipExcept(HashSet<usize>),
+}
+
+impl ParquetStatisticsPolicy {
+ /// Create a `ParquetStatisticsPolicy` to skip all columns except those in
`keep`.
+ pub fn skip_except(keep: &[usize]) -> Self {
Review Comment:
Thanks for the help and patience 🙏. I've been wanting to get this right
before forging ahead with other options.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]