Lordworms commented on code in PR #11289: URL: https://github.com/apache/datafusion/pull/11289#discussion_r1676059795
########## datafusion/core/tests/parquet/arrow_statistics.rs: ########## @@ -1984,7 +1981,96 @@ async fn test_struct() { } .run(); } +// test nested struct +#[tokio::test] +async fn test_nested_struct() { + // This creates a parquet file with 1 column named "nested_struct" + // The file is created by 1 record batch with 3 rows in the nested struct array + let reader = TestReader { + scenario: Scenario::StructArrayNested, + row_per_group: 5, + } + .build() + .await; + // Expected minimum and maximum values for nested struct fields + let inner_min = StructArray::from(vec![ + ( + Arc::new(Field::new("b", DataType::Boolean, false)), + Arc::new(BooleanArray::from(vec![Some(false)])) as ArrayRef, + ), + ( + Arc::new(Field::new("c", DataType::Int32, false)), + Arc::new(Int32Array::from(vec![Some(42)])) as ArrayRef, + ), + ]); + let inner_max = StructArray::from(vec![ + ( + Arc::new(Field::new("b", DataType::Boolean, false)), + Arc::new(BooleanArray::from(vec![Some(true)])) as ArrayRef, + ), + ( + Arc::new(Field::new("c", DataType::Int32, false)), + Arc::new(Int32Array::from(vec![Some(44)])) as ArrayRef, + ), + ]); + + let inner_fields = Fields::from(vec![ + Field::new("b", DataType::Boolean, false), + Field::new("c", DataType::Int32, false), + ]); + + // Expected minimum outer struct + let expected_min_outer = StructArray::from(vec![ + ( + Arc::new(Field::new( + "inner_struct", + DataType::Struct(inner_fields.clone()), + false, + )), + Arc::new(inner_min) as ArrayRef, + ), + ( + Arc::new(Field::new("outer_float", DataType::Float64, false)), + Arc::new(Float64Array::from(vec![Some(5.0)])) as ArrayRef, + ), + ( + Arc::new(Field::new("outer_boolean", DataType::Boolean, false)), + Arc::new(BooleanArray::from(vec![Some(false)])) as ArrayRef, + ), + ]); + + // Expected maximum outer struct + let expected_max_outer = StructArray::from(vec![ + ( + Arc::new(Field::new( + "inner_struct", + DataType::Struct(inner_fields), + false, + )), + Arc::new(inner_max) as ArrayRef, + ), + ( + 
Arc::new(Field::new("outer_float", DataType::Float64, false)), + Arc::new(Float64Array::from(vec![Some(7.0)])) as ArrayRef, + ), + ( + Arc::new(Field::new("outer_boolean", DataType::Boolean, false)), + Arc::new(BooleanArray::from(vec![Some(true)])) as ArrayRef, + ), + ]); + Review Comment: That's what I'm concerned about, since it is hard and seems meaningless to have a top-level null count. The reason I left it like this is just to follow the test pattern. ########## datafusion/core/src/datasource/physical_plan/parquet/statistics.rs: ########## @@ -946,13 +946,29 @@ pub(crate) fn parquet_column<'a>( ) -> Option<(usize, &'a FieldRef)> { let (root_idx, field) = arrow_schema.fields.find(name)?; if field.data_type().is_nested() { - // Nested fields are not supported and require non-trivial logic - // to correctly walk the parquet schema accounting for the - // logical type rules - <https://github.com/apache/parquet-format/blob/master/LogicalTypes.md> - // - // For example a ListArray could correspond to anything from 1 to 3 levels - // in the parquet schema - return None; + match field.data_type() { Review Comment: Sure, I can do it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org