alamb commented on code in PR #10924: URL: https://github.com/apache/datafusion/pull/10924#discussion_r1640395427
########## datafusion/core/src/datasource/file_format/parquet.rs: ########## @@ -482,73 +404,101 @@ pub async fn statistics_from_parquet_meta( file_metadata.key_value_metadata(), )?; - let num_fields = table_schema.fields().len(); - let fields = table_schema.fields(); - let mut num_rows = 0; let mut total_byte_size = 0; - let mut null_counts = vec![Precision::Exact(0); num_fields]; - let mut has_statistics = false; - - let schema_adapter = - DefaultSchemaAdapterFactory::default().create(table_schema.clone()); - - let (mut max_values, mut min_values) = create_max_min_accs(&table_schema); for row_group_meta in metadata.row_groups() { num_rows += row_group_meta.num_rows(); total_byte_size += row_group_meta.total_byte_size(); + } - let mut column_stats: HashMap<usize, (u64, &ParquetStatistics)> = HashMap::new(); + let schema_adapter = + DefaultSchemaAdapterFactory::default().create(table_schema.clone()); - for (i, column) in row_group_meta.columns().iter().enumerate() { - if let Some(stat) = column.statistics() { - has_statistics = true; - column_stats.insert(i, (stat.null_count(), stat)); - } - } + // statistics for each of the table's columns (may be different from the + // file schema) + let mut column_statistics = vec![]; + + for (table_idx, field) in table_schema.fields().iter().enumerate() { + let Some(file_idx) = schema_adapter.map_column_index(table_idx, &file_schema) + else { + // file columns not in table schema are treated as all null + let null_count = Precision::Exact(num_rows as usize); + let null_value = ScalarValue::try_from(field.data_type())?; + let stats = ColumnStatistics::new_unknown() + .with_null_count(null_count) + .with_max_value(Precision::Exact(null_value.clone())) + .with_min_value(Precision::Exact(null_value)); + column_statistics.push(stats); + continue; + }; - if has_statistics { - for (table_idx, null_cnt) in null_counts.iter_mut().enumerate() { - if let Some(file_idx) = - schema_adapter.map_column_index(table_idx, &file_schema) - { - if 
let Some((null_count, stats)) = column_stats.get(&file_idx) { - *null_cnt = null_cnt.add(&Precision::Exact(*null_count as usize)); - summarize_min_max( - &mut max_values, - &mut min_values, - fields, - table_idx, - stats, - ) - } else { - // If none statistics of current column exists, set the Max/Min Accumulator to None. - max_values[table_idx] = None; - min_values[table_idx] = None; - } - } else { - *null_cnt = null_cnt.add(&Precision::Exact(num_rows as usize)); - } - } - } - } + let file_field = file_schema.field(file_idx); + let Some(converter) = StatisticsConverter::try_new( Review Comment: This code now uses the well-tested StatisticsConverter to extract statistics from the Parquet file, with the correct array type, in a single call. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org