alamb commented on code in PR #8866:
URL: https://github.com/apache/arrow-rs/pull/8866#discussion_r2539185277
##########
parquet/src/arrow/async_reader/mod.rs:
##########
@@ -2247,4 +2247,66 @@ mod tests {
},
);
}
+
+ #[tokio::test]
+ async fn test_nested_lists() -> Result<()> {
Review Comment:
This is directly adapted from the reproducer provided by @lewiszlw on
- https://github.com/apache/arrow-rs/issues/8657
##########
parquet/src/arrow/mod.rs:
##########
@@ -590,7 +635,8 @@ mod test {
#[test]
fn test_mask_from_column_names() {
- let message_type = "
+ let schema = parse_schema(
Review Comment:
I just reduced some duplication
##########
parquet/src/arrow/mod.rs:
##########
@@ -419,6 +419,51 @@ impl ProjectionMask {
}
}
}
+
+ /// Return a new [`ProjectionMask`] that excludes any leaf columns that are
+ /// part of a nested type, such as struct, list, or map
+ ///
+ /// If there are no non-nested columns in the mask, returns `None`
+ pub(crate) fn without_nested_types(&self, schema: &SchemaDescriptor) ->
Option<Self> {
+ let num_leaves = schema.num_columns();
+
+ // Count how many leaves each root column has
+ let num_roots = schema.root_schema().get_fields().len();
+ let mut root_leaf_counts = vec![0usize; num_roots];
+ for leaf_idx in 0..num_leaves {
+ let root_idx = schema.get_column_root_idx(leaf_idx);
+ root_leaf_counts[root_idx] += 1;
+ }
+
+ // Keep only leaves whose root has exactly one leaf (non-nested) and
is not a
+ // LIST. LIST is encoded as a wrapped logical type with a single leaf,
e.g.
+ //
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
+ //
+ // ```text
+ // // List<String> (list non-null, elements nullable)
+ // required group my_list (LIST) {
+ // repeated group list {
+ // optional binary element (STRING);
+ // }
+ // }
+ // ```
+ let mut included_leaves = Vec::new();
+ for leaf_idx in 0..num_leaves {
+ if self.leaf_included(leaf_idx) {
+ let root = schema.get_column_root(leaf_idx);
+ let root_idx = schema.get_column_root_idx(leaf_idx);
+ if root_leaf_counts[root_idx] == 1 && !root.is_list() {
Review Comment:
The only code difference here is to add the check for `!root.is_list()`
##########
parquet/src/arrow/mod.rs:
##########
@@ -766,4 +811,226 @@ mod test {
mask1.intersect(&mask2);
assert_eq!(mask1.mask, None);
}
+
+ #[test]
+ fn test_projection_mask_without_nested_no_nested() {
+ // Schema with no nested types
+ let schema = parse_schema(
+ "
+ message test_schema {
+ OPTIONAL INT32 a;
+ OPTIONAL INT32 b;
+ REQUIRED DOUBLE d;
+ }
+ ",
+ );
+
+ let mask = ProjectionMask::all();
+ // All columns are non-nested, but without_nested_types returns a new
mask
+ assert_eq!(
+ Some(ProjectionMask::leaves(&schema, [0, 1, 2])),
+ mask.without_nested_types(&schema)
+ );
+
+ // select b, c
+ let mask = ProjectionMask::leaves(&schema, [1, 2]);
+ assert_eq!(Some(mask.clone()), mask.without_nested_types(&schema));
+ }
+
+ #[test]
+ fn test_projection_mask_without_nested_nested() {
+ // Schema with nested types (structs)
+ let schema = parse_schema(
+ "
+ message test_schema {
+ OPTIONAL INT32 a;
+ OPTIONAL group b {
+ REQUIRED INT32 b1;
+ OPTIONAL INT64 b2;
+ }
+ OPTIONAL group c (LIST) {
+ REPEATED group list {
+ OPTIONAL INT32 element;
+ }
+ }
+ REQUIRED DOUBLE d;
+ }
+ ",
+ );
+
+ // all leaves --> a, d
+ let mask = ProjectionMask::all();
+ assert_eq!(
+ Some(ProjectionMask::leaves(&schema, [0, 4])),
+ mask.without_nested_types(&schema)
+ );
+
+ // b1 --> empty (it is nested)
+ let mask = ProjectionMask::leaves(&schema, [1]);
+ assert_eq!(None, mask.without_nested_types(&schema));
+
+ // b2, d --> d
+ let mask = ProjectionMask::leaves(&schema, [1, 4]);
+ assert_eq!(
+ Some(ProjectionMask::leaves(&schema, [4])),
+ mask.without_nested_types(&schema)
+ );
+
+ // element --> empty (it is nested)
+ let mask = ProjectionMask::leaves(&schema, [3]);
+ assert_eq!(None, mask.without_nested_types(&schema));
+ }
+
+ #[test]
+ fn test_projection_mask_without_nested_map_only() {
+ // Example from
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+ let schema = parse_schema(
+ "
+ message test_schema {
+ required group my_map (MAP) {
+ repeated group key_value {
+ required binary key (STRING);
+ optional int32 value;
+ }
+ }
+ }
+ ",
+ );
+
+ let mask = ProjectionMask::all();
+ assert_eq!(None, mask.without_nested_types(&schema));
+
+ // key --> empty (it is nested)
+ let mask = ProjectionMask::leaves(&schema, [0]);
+ assert_eq!(None, mask.without_nested_types(&schema));
+
+ // value --> empty (it is nested)
+ let mask = ProjectionMask::leaves(&schema, [1]);
+ assert_eq!(None, mask.without_nested_types(&schema));
+ }
+
+ #[test]
+ fn test_projection_mask_without_nested_map_with_non_nested() {
+ // Example from
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+ // with an additional non-nested field
+ let schema = parse_schema(
+ "
+ message test_schema {
+ REQUIRED INT32 a;
+ required group my_map (MAP) {
+ repeated group key_value {
+ required binary key (STRING);
+ optional int32 value;
+ }
+ }
+ REQUIRED INT32 b;
+ }
+ ",
+ );
+
+ // all leaves --> a, b which are the only non nested ones
+ let mask = ProjectionMask::all();
+ assert_eq!(
+ Some(ProjectionMask::leaves(&schema, [0, 3])),
+ mask.without_nested_types(&schema)
+ );
+
+ // key, value, b --> b (the only non-nested one)
+ let mask = ProjectionMask::leaves(&schema, [1, 2, 3]);
+ assert_eq!(
+ Some(ProjectionMask::leaves(&schema, [3])),
+ mask.without_nested_types(&schema)
+ );
+
+ // key, value --> NONE
+ let mask = ProjectionMask::leaves(&schema, [1, 2]);
+ assert_eq!(None, mask.without_nested_types(&schema));
+ }
+
+ #[test]
+ fn test_projection_mask_without_nested_deeply_nested() {
+ // Map of Maps
+ let schema = parse_schema(
+ "
+ message test_schema {
+ OPTIONAL group a (MAP) {
+ REPEATED group key_value {
+ REQUIRED BYTE_ARRAY key (UTF8);
+ OPTIONAL group value (MAP) {
+ REPEATED group key_value {
+ REQUIRED INT32 key;
+ REQUIRED BOOLEAN value;
+ }
+ }
+ }
+ }
+ REQUIRED INT32 b;
+ REQUIRED DOUBLE c;
+ ",
+ );
+
+ let mask = ProjectionMask::all();
+ assert_eq!(
+ Some(ProjectionMask::leaves(&schema, [3, 4])),
+ mask.without_nested_types(&schema)
+ );
+
+ // (first) key, c --> c (the only non-nested one)
+ let mask = ProjectionMask::leaves(&schema, [0, 4]);
+ assert_eq!(
+ Some(ProjectionMask::leaves(&schema, [4])),
+ mask.without_nested_types(&schema)
+ );
+
+ // (second) key, value, b --> b (the only non-nested one)
+ let mask = ProjectionMask::leaves(&schema, [1, 2, 3]);
+ assert_eq!(
+ Some(ProjectionMask::leaves(&schema, [3])),
+ mask.without_nested_types(&schema)
+ );
+
+ // key --> NONE (the only non-nested one)
+ let mask = ProjectionMask::leaves(&schema, [0]);
+ assert_eq!(None, mask.without_nested_types(&schema));
+ }
+
+ #[test]
+ fn test_projection_mask_without_nested_list() {
Review Comment:
this test fails without the code change
##########
parquet/src/arrow/push_decoder/reader_builder/mod.rs:
##########
@@ -641,33 +641,7 @@ impl RowGroupReaderBuilder {
/// Exclude leaves belonging to roots that span multiple parquet leaves
(i.e. nested columns)
fn exclude_nested_columns_from_cache(&self, mask: &ProjectionMask) ->
Option<ProjectionMask> {
- let schema = self.metadata.file_metadata().schema_descr();
- let num_leaves = schema.num_columns();
-
- // Count how many leaves each root column has
- let num_roots = schema.root_schema().get_fields().len();
- let mut root_leaf_counts = vec![0usize; num_roots];
- for leaf_idx in 0..num_leaves {
- let root_idx = schema.get_column_root_idx(leaf_idx);
- root_leaf_counts[root_idx] += 1;
- }
-
- // Keep only leaves whose root has exactly one leaf (non-nested)
- let mut included_leaves = Vec::new();
- for leaf_idx in 0..num_leaves {
- if mask.leaf_included(leaf_idx) {
- let root_idx = schema.get_column_root_idx(leaf_idx);
- if root_leaf_counts[root_idx] == 1 {
- included_leaves.push(leaf_idx);
- }
- }
- }
-
- if included_leaves.is_empty() {
- None
- } else {
- Some(ProjectionMask::leaves(schema, included_leaves))
- }
+ mask.without_nested_types(self.metadata.file_metadata().schema_descr())
Review Comment:
I moved the logic into ProjectionMask
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]