efredine commented on code in PR #11289: URL: https://github.com/apache/datafusion/pull/11289#discussion_r1676961201
########## datafusion/core/src/datasource/physical_plan/parquet/statistics.rs: ########## @@ -1234,7 +1250,87 @@ impl<'a> StatisticsConverter<'a> { arrow_field, }) } + /// recursively get the corresponding statistics for all the column data, used for + /// DataType::Struct + pub(crate) fn get_statistics_min_max_recursive( + metadata: &[&RowGroupMetaData], + index: &mut usize, + is_min: bool, + data_type: &DataType, + ) -> Result<ArrayRef> { + match data_type.is_nested() { + false => { + let iterator = metadata.iter().map(|meta| { + let stat = meta.column(*index).statistics(); + stat + }); + let stat = if is_min { + min_statistics(data_type, iterator) + } else { + max_statistics(data_type, iterator) + }; + *index += 1; + stat + } + true => { + if let DataType::Struct(fields) = data_type { + let field_arrays: Vec<_> = fields + .iter() + .map(|field| { + let array = Self::get_statistics_min_max_recursive( + metadata, + index, + is_min, + field.data_type(), + )?; + Ok((field.clone(), array)) + }) + .collect::<Result<Vec<_>>>()?; + Ok(Arc::new(StructArray::from(field_arrays)) as ArrayRef) + } else { + plan_err!("unsupported nested data type for extracting statistics") + } + } + } + } + /// recursively get the corresponding statistics for all the column data, used for + /// DataType::Struct + pub(crate) fn get_null_counts_recursive( + metadata: &[&RowGroupMetaData], + index: usize, + data_type: &DataType, + ) -> Vec<u64> { Review Comment: If you preferred, this could also be expressed as a fold: ```Rust let num_row_groups = metadata.len(); fields.iter().fold(vec![0; num_row_groups], |mut acc, field| { let field_null_counts = Self::get_null_counts_recursive( metadata, index + 1, field.data_type(), ); acc.iter_mut().zip(field_null_counts.iter()).for_each(|(a, b)| *a += b); acc }) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org