yordan-pavlov commented on a change in pull request #1389:
URL: https://github.com/apache/arrow-rs/pull/1389#discussion_r819869426
##########
File path: parquet/src/file/serialized_reader.rs
##########
@@ -138,25 +188,38 @@ impl<R: 'static + ChunkReader> SerializedFileReader<R> {
})
}
- /// Filters row group metadata to only those row groups,
- /// for which the predicate function returns true
- pub fn filter_row_groups(
- &mut self,
- predicate: &dyn Fn(&RowGroupMetaData, usize) -> bool,
- ) {
- let mut filtered_row_groups = Vec::<RowGroupMetaData>::new();
- for (i, row_group_metadata) in
self.metadata.row_groups().iter().enumerate() {
- if predicate(row_group_metadata, i) {
- filtered_row_groups.push(row_group_metadata.clone());
- }
+ /// Creates file reader from a Parquet file with read options.
+ /// Returns error if Parquet file does not exist or is corrupt.
+ pub fn new_with_options(chunk_reader: R, options: ReadOptions) ->
Result<Self> {
+ let metadata = footer::parse_metadata(&chunk_reader)?;
+ let mut row_groups = metadata.row_groups().to_vec();
+ for mut predicate in options.predicate {
+ row_groups = row_groups
+ .into_iter()
+ .enumerate()
+ .filter(|(i, rg_meta)| predicate(rg_meta, *i))
+ .map(|(_, rg_meta)| rg_meta)
+ .collect::<Vec<_>>();
Review comment:
Is it necessary to `collect` here on every predicate iteration?
Wouldn't it be more efficient to collect only once, after all predicates
have been applied?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]