wgtmac commented on code in PR #49125:
URL: https://github.com/apache/arrow/pull/49125#discussion_r2785621877


##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -623,7 +624,10 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
   if (group.field_count() != 1) {
     return Status::Invalid("MAP-annotated groups must have a single child.");
   }
-  if (group.is_repeated()) {
+  // MAP-annotated groups cannot be repeated unless in a legacy two-level 
LIST-annotated
+  // group.

Review Comment:
   ```suggestion
     // MAP-annotated groups cannot be repeated unless served as an
     // element within a LIST-annotated group with 2-level structure.
   ```



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -697,41 +703,53 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
   return Status::OK();
 }
 
-Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
-                         SchemaTreeContext* ctx, const SchemaField* parent,
-                         SchemaField* out) {
-  if (group.field_count() != 1) {
-    return Status::Invalid("LIST-annotated groups must have a single child.");
-  }
-  if (group.is_repeated()) {
-    return Status::Invalid("LIST-annotated groups must not be repeated.");
-  }
-
-  current_levels.Increment(group);
+Status ResolveList(const GroupNode& group, const Node& list_node,
+                   LevelInfo current_levels, SchemaTreeContext* ctx, 
SchemaField* out) {
+  auto check_two_level_list_repeated = [](const GroupNode& group) -> Status {
+    // When it is repeated, the LIST-annotated 2-level structure can only 
serve as an
+    // element within another LIST-annotated 2-level structure.
+    if (group.is_repeated() &&
+        (group.parent() == nullptr || 
!group.parent()->logical_type()->is_list())) {
+      return Status::Invalid("LIST-annotated groups must not be repeated.");
+    }
+    return {};
+  };
 
-  out->children.resize(group.field_count());
   SchemaField* child_field = &out->children[0];
 
-  ctx->LinkParent(out, parent);
-  ctx->LinkParent(child_field, out);
-
-  const Node& list_node = *group.field(0);
-
-  if (!list_node.is_repeated()) {
-    return Status::Invalid(
-        "Non-repeated nodes in a LIST-annotated group are not supported.");
-  }
-
-  int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
   if (list_node.is_group()) {
     const auto& list_group = static_cast<const GroupNode&>(list_node);
     if (list_group.field_count() > 1) {
       // The inner type of the list should be a struct when there are multiple 
fields
       // in the repeated group
-      RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, 
child_field));
-    } else if (list_group.field_count() == 1) {
-      const auto& repeated_field = list_group.field(0);
-      if (repeated_field->is_repeated()) {
+      RETURN_NOT_OK(check_two_level_list_repeated(group));

Review Comment:
   Do we need to validate that `list_group` has no logical type annotation?



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -697,17 +703,77 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
   return Status::OK();
 }
 
+Status ResolveList(const GroupNode& group, const Node& list_node,
+                   LevelInfo current_levels, SchemaTreeContext* ctx, 
SchemaField* out) {
+  auto check_two_level_list_repeated = [](const GroupNode& group) -> Status {

Review Comment:
   ```suggestion
     auto check_two_level_list_repetition = [](const GroupNode& group) -> 
Status {
   ```



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -844,34 +900,6 @@ TEST_F(TestConvertParquetSchema, 
ParquetRepeatedNestedSchema) {
 }
 
 TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
-  // List<Map<String, String>> in two-level list encoding:

Review Comment:
   Do we want to add some illegal cases like below to verify that two-level 
list-annotated group cannot be repeated:
   
   ```
     repeated group my_list (LIST) {
       repeated int32 array;
     }
   
     required group my_struct {
       repeated group my_list (LIST) {
         repeated int32 array;
       }
     }
   ```



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -697,41 +703,53 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
   return Status::OK();
 }
 
-Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
-                         SchemaTreeContext* ctx, const SchemaField* parent,
-                         SchemaField* out) {
-  if (group.field_count() != 1) {
-    return Status::Invalid("LIST-annotated groups must have a single child.");
-  }
-  if (group.is_repeated()) {
-    return Status::Invalid("LIST-annotated groups must not be repeated.");
-  }
-
-  current_levels.Increment(group);
+Status ResolveList(const GroupNode& group, const Node& list_node,
+                   LevelInfo current_levels, SchemaTreeContext* ctx, 
SchemaField* out) {
+  auto check_two_level_list_repeated = [](const GroupNode& group) -> Status {
+    // When it is repeated, the LIST-annotated 2-level structure can only 
serve as an
+    // element within another LIST-annotated 2-level structure.
+    if (group.is_repeated() &&
+        (group.parent() == nullptr || 
!group.parent()->logical_type()->is_list())) {
+      return Status::Invalid("LIST-annotated groups must not be repeated.");
+    }
+    return {};
+  };
 
-  out->children.resize(group.field_count());
   SchemaField* child_field = &out->children[0];
 
-  ctx->LinkParent(out, parent);
-  ctx->LinkParent(child_field, out);
-
-  const Node& list_node = *group.field(0);
-
-  if (!list_node.is_repeated()) {
-    return Status::Invalid(
-        "Non-repeated nodes in a LIST-annotated group are not supported.");
-  }
-
-  int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
   if (list_node.is_group()) {
     const auto& list_group = static_cast<const GroupNode&>(list_node);
     if (list_group.field_count() > 1) {
       // The inner type of the list should be a struct when there are multiple 
fields
       // in the repeated group
-      RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, 
child_field));
-    } else if (list_group.field_count() == 1) {
-      const auto& repeated_field = list_group.field(0);
-      if (repeated_field->is_repeated()) {
+      RETURN_NOT_OK(check_two_level_list_repeated(group));
+      return GroupToStruct(list_group, current_levels, ctx, out, child_field);
+    }
+    if (list_group.field_count() == 0) {
+      return Status::Invalid("Group must have at least one child.");
+    }
+
+    if (list_group.logical_type()->is_none() && HasListElementName(list_group, 
group)) {

Review Comment:
   Now this is more strict than before because of the additional logical type 
check.



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -681,6 +707,36 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
       arrow_fields.push_back(::arrow::field("my_list", arrow_list, 
/*nullable=*/true));
     }
 
+    // List<Map<String, String>> in two-level list encoding:
+    //
+    // optional group my_list (LIST) {
+    //   repeated group array (MAP) {
+    //     repeated group key_value {
+    //       required binary key (STRING);
+    //       optional binary value (STRING);
+    //     }
+    //   }
+    // }

Review Comment:
   Do we want to add a case for the below which is still produced by 
parquet-java by default:
   
   ```
       // optional group my_list (LIST) {
       //   repeated group array (MAP) {
       //     repeated group key_value (MAP_KEY_VALUE) {
       //       required binary key (STRING);
       //       optional binary value (STRING);
       //     }
       //   }
       // }
   ```



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -697,17 +703,77 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
   return Status::OK();
 }
 
+Status ResolveList(const GroupNode& group, const Node& list_node,
+                   LevelInfo current_levels, SchemaTreeContext* ctx, 
SchemaField* out) {
+  auto check_two_level_list_repeated = [](const GroupNode& group) -> Status {
+    // When it is repeated, the LIST-annotated 2-level structure can only 
serve as an
+    // element within another LIST-annotated 2-level structure.
+    if (group.is_repeated() &&
+        (group.parent() == nullptr || 
!group.parent()->logical_type()->is_list())) {
+      return Status::Invalid("LIST-annotated groups must not be repeated.");
+    }
+    return {};
+  };
+
+  SchemaField* child_field = &out->children[0];
+
+  if (list_node.is_group()) {
+    const auto& list_group = static_cast<const GroupNode&>(list_node);
+    if (list_group.field_count() > 1) {
+      // The inner type of the list should be a struct when there are multiple 
fields
+      // in the repeated group
+      RETURN_NOT_OK(check_two_level_list_repeated(group));
+      return GroupToStruct(list_group, current_levels, ctx, out, child_field);
+    }
+    if (list_group.field_count() == 0) {
+      return Status::Invalid("Group must have at least one child.");
+    }
+
+    if (list_group.logical_type()->is_none() && HasListElementName(list_group, 
group)) {
+      // Rule 4 at 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
+      RETURN_NOT_OK(check_two_level_list_repeated(group));
+      return GroupToStruct(list_group, current_levels, ctx, out, child_field);
+    }
+
+    const auto& repeated_field = list_group.field(0);
+    if (!list_group.logical_type()->is_none() || 
repeated_field->is_repeated()) {
+      RETURN_NOT_OK(check_two_level_list_repeated(group));
+      if (list_group.logical_type()->is_list()) {
+        return ListToSchemaField(list_group, current_levels, ctx, out, 
child_field);
+      } else if (list_group.logical_type()->is_map()) {
+        return MapToSchemaField(list_group, current_levels, ctx, out, 
child_field);
+      } else {
+        return GroupToStruct(list_group, current_levels, ctx, out, 
child_field);
+      }
+    }
+
+    if (group.is_repeated()) {
+      return Status::Invalid("LIST-annotated groups must not be repeated.");
+    }
+
+    return NodeToSchemaField(*repeated_field, current_levels, ctx, out, 
child_field);
+  }
+
+  RETURN_NOT_OK(check_two_level_list_repeated(group));

Review Comment:
   Should we validate that `list_node` is repeated?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to