etseidl commented on code in PR #8797:
URL: https://github.com/apache/arrow-rs/pull/8797#discussion_r2524008847
##########
parquet/src/file/metadata/options.rs:
##########
@@ -48,11 +70,70 @@ impl ParquetMetaDataOptions {
self.schema_descr = Some(val);
}
- /// Provide a schema to use when decoding the metadata. Returns `Self` for chaining.
- pub fn with_schema(mut self, val: SchemaDescPtr) -> Self {
- self.schema_descr = Some(val);
- self
+ // with_schema
+ add_mutator!(schema, SchemaDescPtr);
+
+ /// Returns whether to present the `encoding_stats` field of the `ColumnMetaData` as a
+ /// bitmask.
+ ///
+ /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
+ /// might be desirable.
+ ///
+ /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
+ /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
+ pub fn encoding_stats_as_mask(&self) -> bool {
+ self.encoding_stats_as_mask
+ }
+
+ /// Convert `encoding_stats` from a vector of [`PageEncodingStats`] to a bitmask. This can
+ /// speed up metadata decoding while still enabling some use cases served by the full stats.
+ ///
+ /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for more information.
+ ///
+ /// [`PageEncodingStats`]: crate::file::metadata::PageEncodingStats
+ /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
+ /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
+ pub fn set_encoding_stats_as_mask(&mut self, val: bool) {
+ self.encoding_stats_as_mask = val;
}
+
+ // with_encoding_stats_as_mask
+ add_mutator!(encoding_stats_as_mask, bool);
Review Comment:
FWIW I've tried implementing a macro to generate all of the setters/getters.
<details>
```rust
// For the field "name", generate setters and getters for the field. This generates:
// skip_name
// set_skip_name
// with_skip_name
// set_keep_name
// with_keep_name
// This assumes a field "skip_name" exists in the ParquetMetaDataOptions struct.
macro_rules! add_bool_accessors {
($name:expr) => {
paste! {
#[doc = concat!("Returns whether to skip decoding the `",
stringify!($name), "`")]
#[doc = "in the Parquet [`ColumnMetaData`] for the column
indexed by `col_index`."]
#[doc = ""]
#[doc = "[`ColumnMetaData`]:
https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L875"]
pub fn [<skip_ $name>](&self, col_index: usize) -> bool {
self.[<skip_ $name>]
.as_ref()
.is_some_and(|oset| oset.as_ref().is_none_or(|keep|
!keep.contains(&col_index)))
}
...
}
}
}
pub struct ParquetMetaDataOptions {
...
skip_encoding_stats: Option<Option<Arc<HashSet<usize>>>>,
skip_statistics: Option<Option<Arc<HashSet<usize>>>>,
skip_size_statistics: Option<Option<Arc<HashSet<usize>>>>,
skip_geospatial_statistics: Option<Option<Arc<HashSet<usize>>>>,
}
impl ParquetMetaDataOptions {
...
add_bool_accessors!(encoding_stats);
add_bool_accessors!(statistics);
add_bool_accessors!(size_statistics);
add_bool_accessors!(geospatial_statistics);
}
```
</details>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org