alamb commented on code in PR #10802: URL: https://github.com/apache/datafusion/pull/10802#discussion_r1627707292
##########
datafusion-examples/examples/parquet_index.rs:
##########
@@ -518,21 +518,17 @@ impl ParquetMetadataIndexBuilder {
         // extract the parquet statistics from the file's footer
         let metadata = reader.metadata();
+        let row_groups = metadata.row_groups();

         // Extract the min/max values for each row group from the statistics
-        let row_counts = StatisticsConverter::row_counts(reader.metadata())?;
-        let value_column_mins = StatisticsConverter::try_new(
+        let converter = StatisticsConverter::try_new(

Review Comment:
   This is a pretty good example of how the statistics API changed. FYI @NGA-TRAN

##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -136,32 +133,35 @@ impl RowGroupSet {
         metrics: &ParquetFileMetrics,
     ) {
         assert_eq!(groups.len(), self.len());
-        for (idx, metadata) in groups.iter().enumerate() {
-            if !self.should_scan(idx) {
-                continue;
-            }
-            let pruning_stats = RowGroupPruningStatistics {
-                parquet_schema,
-                row_group_metadata: metadata,
-                arrow_schema,
-            };
-            match predicate.prune(&pruning_stats) {
-                Ok(values) => {
-                    // NB: false means don't scan row group
-                    if !values[0] {
+        // Indexes of row groups still to scan

Review Comment:
   Here is the change to prune all row groups with one call to `PruningPredicate::prune` rather than one call per row group

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org