xinlifoobar commented on code in PR #10802: URL: https://github.com/apache/datafusion/pull/10802#discussion_r1627827138
########## datafusion/core/src/datasource/physical_plan/parquet/statistics.rs: ########## @@ -404,67 +410,136 @@ impl<'a> StatisticsConverter<'a> { column_name ); }; - Ok(Self { - column_name, - statistics_type, + + // find the column in the parquet schema, if not, return a null array + let parquet_index = match parquet_column( + parquet_schema, arrow_schema, + column_name, + ) { + Some((parquet_idx, matched_field)) => { + // sanity check that matching field matches the arrow field + if matched_field.as_ref() != arrow_field { + return internal_err!( + "Matched column '{:?}' does not match original matched column '{:?}'", + matched_field, + arrow_field + ); + } + Some(parquet_idx) + } + None => None, + }; + + Ok(Self { + parquet_index, arrow_field, }) } - /// extract the statistics from a parquet file, given the parquet file's metadata + /// Extract the minimum values from row group statistics in [`RowGroupMetaData`] + /// + /// # Return Value /// - /// The returned array contains 1 value for each row group in the parquet - /// file in order + /// The returned array contains 1 value for each row group, in the same order as `metadatas` /// /// Each value is either - /// * the requested statistics type for the column + /// * the minimum value for the column /// * a null value, if the statistics can not be extracted /// - /// Note that a null value does NOT mean the min or max value was actually + /// Note that a null value does NOT mean the min value was actually /// `null` it means it the requested statistic is unknown /// + /// # Errors + /// /// Reasons for not being able to extract the statistics include: /// * the column is not present in the parquet file /// * statistics for the column are not present in the row group /// * the stored statistic value can not be converted to the requested type - pub fn extract(&self, metadata: &ParquetMetaData) -> Result<ArrayRef> { + /// + /// # Example + /// ```no_run + /// # use arrow::datatypes::Schema; + /// # use 
arrow_array::ArrayRef; + /// # use parquet::file::metadata::ParquetMetaData; + /// # use datafusion::datasource::physical_plan::parquet::StatisticsConverter; + /// # fn get_parquet_metadata() -> ParquetMetaData { unimplemented!() } + /// # fn get_arrow_schema() -> Schema { unimplemented!() } + /// // Given the metadata for a parquet file and the arrow schema + /// let metadata: ParquetMetaData = get_parquet_metadata(); + /// let arrow_schema: Schema = get_arrow_schema(); + /// let parquet_schema = metadata.file_metadata().schema_descr(); + /// // create a converter + /// let converter = StatisticsConverter::try_new("foo", &arrow_schema, parquet_schema) + /// .unwrap(); + /// // get the minimum value for the column "foo" in the parquet file + /// let min_values: ArrayRef = converter + /// .row_group_mins(metadata.row_groups().iter()) + /// .unwrap(); + /// ``` + pub fn row_group_mins<I>(&self, metadatas: I) -> Result<ArrayRef> + where + I: IntoIterator<Item = &'a RowGroupMetaData>, + { let data_type = self.arrow_field.data_type(); - let num_row_groups = metadata.row_groups().len(); - let parquet_schema = metadata.file_metadata().schema_descr(); - let row_groups = metadata.row_groups(); + let Some(parquet_index) = self.parquet_index else { + return Ok(self.make_null_array(data_type, metadatas)); + }; - // find the column in the parquet schema, if not, return a null array - let Some((parquet_idx, matched_field)) = - parquet_column(parquet_schema, self.arrow_schema, self.column_name) - else { - // column was in the arrow schema but not in the parquet schema, so return a null array - return Ok(new_null_array(data_type, num_row_groups)); + let iter = metadatas + .into_iter() + .map(|x| x.column(parquet_index).statistics()); + min_statistics(data_type, iter) + } + + /// Extract the maximum values from row group statistics in [`RowGroupMetaData`] + /// + /// See docs on [`Self::row_group_mins`] for details + pub fn row_group_maxes<I>(&self, metadatas: I) -> 
Result<ArrayRef> + where + I: IntoIterator<Item = &'a RowGroupMetaData>, + { + let data_type = self.arrow_field.data_type(); + + let Some(parquet_index) = self.parquet_index else { + return Ok(self.make_null_array(data_type, metadatas)); }; - // sanity check that matching field matches the arrow field - if matched_field.as_ref() != self.arrow_field { - return internal_err!( - "Matched column '{:?}' does not match original matched column '{:?}'", - matched_field, - self.arrow_field - ); - } + let iter = metadatas + .into_iter() + .map(|x| x.column(parquet_index).statistics()); + max_statistics(data_type, iter) Review Comment: The `min_statistics` and `max_statistics` changes in another PR could still be used here... -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org