alamb commented on code in PR #10730:
URL: https://github.com/apache/datafusion/pull/10730#discussion_r1623278573


##########
datafusion/core/tests/parquet/arrow_statistics.rs:
##########
@@ -1168,7 +1165,7 @@ async fn test_struct() {
         reader: reader.build().await,
         expected_min: Arc::new(struct_array(vec![(Some(1), Some(6.0), 
Some(12.0))])),
         expected_max: Arc::new(struct_array(vec![(Some(2), Some(8.5), 
Some(14.0))])),
-        expected_null_counts: UInt64Array::from(vec![0]),
+        expected_null_counts: UInt64Array::from(vec![2]),

Review Comment:
   Something isn't right here -- the structs themselves aren't nullable (only 
some of their child fields)
   
   
https://github.com/apache/datafusion/blob/7c08a6fd00fcb352c11889ed62de7c9948978c79/datafusion/core/tests/parquet/mod.rs#L954-L958
   
   
https://github.com/apache/datafusion/blob/7c08a6fd00fcb352c11889ed62de7c9948978c79/datafusion/core/tests/parquet/mod.rs#L1024-L1045
   
   



##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -216,6 +225,96 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = 
Option<&'a ParquetStatistics
     collect_scalars(data_type, scalars)
 }
 
+/// Extract the min statistics for struct array
+pub(crate) fn struct_min_statistics(
+    row_groups: &[RowGroupMetaData],
+    struct_fields: &Fields,
+) -> Result<ArrayRef, DataFusionError> {
+    let mut child_data = Vec::new();
+    let mut fields = Vec::new();
+
+    if struct_fields.iter().any(|f| f.data_type().is_nested()) {
+        return Ok(new_empty_array(&DataType::Struct(struct_fields.clone())));
+    }
+
+    for (idx, field) in struct_fields.iter().enumerate() {
+        // Handle non-nested fields
+        let max_value = row_groups
+            .iter()
+            .map(|x| x.column(idx).statistics())
+            .map(|x| {
+                x.and_then(|s| get_statistic!(s, min, min_bytes, 
Some(field.data_type())))
+            });
+        let array = collect_scalars(field.data_type(), max_value)?;
+        child_data.push(Arc::new(array) as Arc<dyn Array>);
+        fields.push(Arc::new(Field::new(
+            field.name(),
+            field.data_type().clone(),
+            field.is_nullable(),
+        )));
+    }
+    // Create a StructArray from collected fields and data
+    let struct_array =
+        
StructArray::from(fields.into_iter().zip(child_data).collect::<Vec<_>>());
+    println!("the struct array is {:?}", struct_array);
+    Ok(Arc::new(struct_array) as ArrayRef)
+}
+
+/// Extract the max statistics for struct array
+pub(crate) fn struct_max_statistics(
+    row_groups: &[RowGroupMetaData],
+    struct_fields: &Fields,
+) -> Result<ArrayRef, DataFusionError> {
+    let mut child_data = Vec::new();
+    let mut fields = Vec::new();
+
+    if struct_fields.iter().any(|f| f.data_type().is_nested()) {
+        return Ok(new_empty_array(&DataType::Struct(struct_fields.clone())));
+    }
+
+    for (idx, field) in struct_fields.iter().enumerate() {
+        // Handle non-nested fields
+        let max_value = row_groups
+            .iter()
+            .map(|x| x.column(idx).statistics())
+            .map(|x| {
+                x.and_then(|s| get_statistic!(s, max, max_bytes, 
Some(field.data_type())))
+            });
+        let array = collect_scalars(field.data_type(), max_value)?;
+        child_data.push(Arc::new(array) as Arc<dyn Array>);
+        fields.push(Arc::new(Field::new(
+            field.name(),
+            field.data_type().clone(),
+            field.is_nullable(),
+        )));
+    }
+    // Create a StructArray from collected fields and data
+    let struct_array =
+        
StructArray::from(fields.into_iter().zip(child_data).collect::<Vec<_>>());
+    Ok(Arc::new(struct_array) as ArrayRef)
+}
+
+/// Extract the nullcount statistics for struct array
+pub(crate) fn struct_null_count_statistics(
+    row_groups: &[RowGroupMetaData],
+    struct_fields: &Fields,
+) -> Result<ArrayRef, DataFusionError> {
+    if struct_fields.iter().any(|f| f.data_type().is_nested()) {
+        return Ok(Arc::new(new_empty_array(&DataType::UInt64)) as ArrayRef);
+    }
+
+    let mut null_count: u64 = 0;

Review Comment:
   I think nulls only have to check the current nulls (not their children's 
nulls)... 



##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -190,20 +190,29 @@ pub(crate) fn parquet_column<'a>(
     name: &str,
 ) -> Option<(usize, &'a FieldRef)> {
     let (root_idx, field) = arrow_schema.fields.find(name)?;
-    if field.data_type().is_nested() {
-        // Nested fields are not supported and require non-trivial logic
-        // to correctly walk the parquet schema accounting for the
-        // logical type rules - 
<https://github.com/apache/parquet-format/blob/master/LogicalTypes.md>
-        //
-        // For example a ListArray could correspond to anything from 1 to 3 
levels
-        // in the parquet schema
-        return None;
-    }
+    match field.data_type() {
+        DataType::Struct(_) => {
+            let parquet_idx = (0..parquet_schema.columns().len())

Review Comment:
   @tustvold  could you help here -- is this the correct way to find the 
correct parquet leaf from an arrow struct type?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to