alamb commented on code in PR #7360: URL: https://github.com/apache/arrow-rs/pull/7360#discussion_r2021149241
##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -646,6 +716,125 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
             apply_range(selection, reader.num_rows(), self.offset, self.limit),
         ))
     }
+
+    /// Apply predicate pushdowns to filter out row groups that can't match
+    fn apply_predicate_pushdowns(
+        &self,
+        row_groups: &[usize],
+        pushdowns: &PredicatePushdowns,
+    ) -> Result<Vec<usize>> {
+        // This implementation checks if row groups can be skipped based on statistics
+        let mut filtered_row_groups = Vec::new();
+
+        for &rg_idx in row_groups {
+            let rg = self.metadata.row_group(rg_idx);
+            let mut include_rg = true;
+
+            // Check each predicate against this row group
+            for predicate in pushdowns.predicates() {
+                // Find the column in the row group
+                let col_idx = self.find_column_index(predicate.column())?;
+                if col_idx.is_none() {
+                    // Column not found, can't apply this predicate
+                    continue;
+                }
+
+                let col_idx = col_idx.unwrap();
+                let column_chunk = rg.column(col_idx);
+
+                // Check if the column chunk has statistics
+                if let Some(stats) = column_chunk.statistics() {
+                    // Get the Arrow data type for this column
+                    let arrow_schema = self.schema.as_ref();
+                    let arrow_field = match arrow_schema.field_with_name(predicate.column()) {
+                        Ok(field) => field,
+                        Err(_) => continue, // Field not found in Arrow schema
+                    };
+                    let data_type = arrow_field.data_type();
+
+                    // Convert Parquet statistics to Arrow arrays
+                    let min_array = match convert_stat_to_array(stats, data_type, true) {
+                        Ok(arr) => arr,
+                        Err(_) => continue, // Can't convert statistics
+                    };
+
+                    let max_array = match convert_stat_to_array(stats, data_type, false) {
+                        Ok(arr) => arr,
+                        Err(_) => continue, // Can't convert statistics
+                    };
+
+                    // Check if we can skip this row group based on statistics
+                    if !predicate.can_use_chunk(min_array.as_ref(), max_array.as_ref()) {
+                        include_rg = false;
+                        break; // No need to check other predicates if we're already skipping
+                    }
+                }
+            }
+
+            if include_rg {
+                filtered_row_groups.push(rg_idx);
+            }
+        }
+
+        Ok(filtered_row_groups)
+    }
+
+    /// Find the index of a column by name
+    fn find_column_index(&self, column_name: &str) -> Result<Option<usize>> {
+        let schema = self.metadata.file_metadata().schema_descr();
+        for i in 0..schema.num_columns() {
+            let column = schema.column(i);
+            let col_path = column.path();
+            if col_path.string() == column_name {
+                return Ok(Some(i));
+            }
+        }
+        Ok(None)
+    }
+}
+
+/// Convert a Parquet statistic to an Arrow array

Review Comment:
   I think this is largely duplicated with https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/statistics/struct.StatisticsConverter.html

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org