This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new a6fe59592c GH-43994: [C++][Parquet] Fix schema conversion from 
two-level encoding nested list (#43995)
a6fe59592c is described below

commit a6fe59592c643d591ee719041dd2b5d8b9ada7d6
Author: Gang Wu <[email protected]>
AuthorDate: Mon Nov 25 11:48:41 2024 +0800

    GH-43994: [C++][Parquet] Fix schema conversion from two-level encoding 
nested list (#43995)
    
    ### Rationale for this change
    
    The current C++ parquet implementation interprets following parquet schema 
as `array<struct<array:array<int>>>, which is wrong:
    ```
      optional group a (LIST) {
        repeated group array (LIST) {
          repeated int32 array;
        }
      }
    ```
    
    ### What changes are included in this PR?
    
    According to the parquet spec, the above schema should be inferred as 
`array<array<int>>`.
    
    ### Are these changes tested?
    
    Yes, a test case has been added to verify the fix.
    
    ### Are there any user-facing changes?
    
    No.
    * GitHub Issue: #43994
    
    Authored-by: Gang Wu <[email protected]>
    Signed-off-by: Gang Wu <[email protected]>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc |  75 +++++++++++++++
 cpp/src/parquet/arrow/arrow_schema_test.cc        | 111 +++++++++++++++++++++-
 cpp/src/parquet/arrow/schema.cc                   | 103 +++++++++++++-------
 cpp/submodules/parquet-testing                    |   2 +-
 4 files changed, 253 insertions(+), 38 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc 
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 78d272ff24..3a3c1e815e 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4085,6 +4085,81 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) {
   TryReadDataFile(path);
 }
 
+TEST(TestArrowReaderAdHoc, LegacyTwoLevelList) {
+  auto VerifyData = [](std::unique_ptr<ParquetFileReader> file_reader) {
+    // Expected Parquet schema of legacy two-level encoding
+    constexpr std::string_view kExpectedLegacyList =
+        "required group field_id=-1 a (List) {\n"
+        "  repeated group field_id=-1 array (List) {\n"
+        "    repeated int32 field_id=-1 array;\n"
+        "  }\n"
+        "}\n";
+
+    // Expected Arrow schema and data
+    auto arrow_inner_list =
+        field("array", list(field("array", ::arrow::int32(), 
/*nullable=*/false)),
+              /*nullable=*/false);
+    auto arrow_outer_list = list(arrow_inner_list);
+    auto arrow_schema =
+        ::arrow::schema({field("a", arrow_outer_list, /*nullable=*/false)});
+    auto expected_table = TableFromJSON(arrow_schema, 
{R"([[[[1,2],[3,4]]]])"});
+
+    // Verify Parquet schema
+    auto root_group = file_reader->metadata()->schema()->group_node();
+    ASSERT_EQ(1, root_group->field_count());
+    std::stringstream nodeStr;
+    PrintSchema(root_group->field(0).get(), nodeStr);
+    ASSERT_EQ(kExpectedLegacyList, nodeStr.str());
+
+    // Verify Arrow schema and data
+    std::unique_ptr<FileReader> reader;
+    ASSERT_OK_NO_THROW(
+        FileReader::Make(default_memory_pool(), std::move(file_reader), 
&reader));
+    std::shared_ptr<Table> table;
+    ASSERT_OK(reader->ReadTable(&table));
+    AssertTablesEqual(*expected_table, *table);
+  };
+
+  // Round-trip test for Parquet C++ reader and writer
+  {
+    // Create Parquet schema of legacy two-level encoding
+    auto inner_list = GroupNode::Make("array", Repetition::REPEATED,
+                                      {schema::Int32("array", 
Repetition::REPEATED)},
+                                      LogicalType::List());
+    auto outer_list =
+        GroupNode::Make("a", Repetition::REQUIRED, {inner_list}, 
LogicalType::List());
+    auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, 
{outer_list});
+
+    // Create a Parquet writer to write values of nested list
+    auto sink = CreateOutputStream();
+    auto file_writer =
+        ParquetFileWriter::Open(sink, 
std::dynamic_pointer_cast<GroupNode>(schema_node));
+    auto row_group_writer = file_writer->AppendRowGroup();
+    auto int_writer = 
dynamic_cast<Int32Writer*>(row_group_writer->NextColumn());
+    ASSERT_TRUE(int_writer != nullptr);
+
+    // Directly write a single row of nested list: [[1, 2],[3, 4]]
+    constexpr int64_t kNumValues = 4;
+    constexpr std::array<int16_t, kNumValues> kRepLevels = {0, 2, 1, 2};
+    constexpr std::array<int16_t, kNumValues> kDefLevels = {2, 2, 2, 2};
+    constexpr std::array<int32_t, kNumValues> kValues = {1, 2, 3, 4};
+    int_writer->WriteBatch(kNumValues, kDefLevels.data(), kRepLevels.data(),
+                           kValues.data());
+    file_writer->Close();
+    ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
+
+    // Read schema and verify it applies two-level encoding of list type
+    ASSERT_NO_FATAL_FAILURE(
+        
VerifyData(ParquetFileReader::Open(std::make_shared<BufferReader>(buffer))));
+  }
+
+  // Interoperability test for Parquet file generated by parquet-java
+  {
+    auto path = std::string(test::get_data_dir()) + 
"/old_list_structure.parquet";
+    ASSERT_NO_FATAL_FAILURE(VerifyData(ParquetFileReader::OpenFile(path)));
+  }
+}
+
 class TestArrowReaderAdHocSparkAndHvr
     : public ::testing::TestWithParam<
           std::tuple<std::string, std::shared_ptr<DataType>>> {};
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc 
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index d261482d89..a6e04e5425 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -18,6 +18,7 @@
 #include <memory>
 #include <vector>
 
+#include "gmock/gmock-matchers.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
@@ -601,6 +602,58 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
     arrow_fields.push_back(::arrow::field("name", arrow_list, false));
   }
 
+  // Two-level encoding List<List<Integer>>:
+  // optional group my_list (LIST) {
+  //   repeated group array (LIST) {
+  //     repeated int32 array;
+  //   }
+  // }
+  {
+    auto inner_array =
+        PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
+    auto outer_array = GroupNode::Make("array", Repetition::REPEATED, 
{inner_array},
+                                       ConvertedType::LIST);
+    parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
+                                             {outer_array}, 
ConvertedType::LIST));
+    auto arrow_inner_array = ::arrow::field("array", INT32, 
/*nullable=*/false);
+    auto arrow_outer_array =
+        ::arrow::field("array", ::arrow::list(arrow_inner_array), 
/*nullable=*/false);
+    auto arrow_list = ::arrow::list(arrow_outer_array);
+    arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+  }
+
+  // List<Map<String, String>> in three-level list encoding:
+  // optional group my_list (LIST) {
+  //   repeated group list {
+  //     required group element (MAP) {
+  //       repeated group key_value {
+  //         required binary key (STRING);
+  //         optional binary value (STRING);
+  //       }
+  //     }
+  //   }
+  // }
+  {
+    auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, 
ParquetType::BYTE_ARRAY,
+                                   ConvertedType::UTF8);
+    auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+                                     ParquetType::BYTE_ARRAY, 
ConvertedType::UTF8);
+    auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, 
value});
+    auto element =
+        GroupNode::Make("element", Repetition::REQUIRED, {key_value}, 
ConvertedType::MAP);
+    auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+    parquet_fields.push_back(
+        GroupNode::Make("my_list", Repetition::OPTIONAL, {list}, 
ConvertedType::LIST));
+
+    auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
+    auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
+    auto arrow_element = ::arrow::field(
+        "element", std::make_shared<::arrow::MapType>(arrow_key, arrow_value),
+        /*nullable=*/false);
+    auto arrow_list = ::arrow::list(arrow_element);
+    arrow_fields.push_back(::arrow::field("my_list", arrow_list, 
/*nullable=*/true));
+  }
+
   auto arrow_schema = ::arrow::schema(arrow_fields);
   ASSERT_OK(ConvertSchema(parquet_fields));
 
@@ -727,6 +780,60 @@ TEST_F(TestConvertParquetSchema, 
ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
+  // List<Map<String, String>> in two-level list encoding:
+  //
+  // optional group my_list (LIST) {
+  //   repeated group array (MAP) {
+  //     repeated group key_value {
+  //       required binary key (STRING);
+  //       optional binary value (STRING);
+  //     }
+  //   }
+  // }
+  {
+    auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, 
ParquetType::BYTE_ARRAY,
+                                   ConvertedType::UTF8);
+    auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+                                     ParquetType::BYTE_ARRAY, 
ConvertedType::UTF8);
+    auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, 
value});
+    auto array =
+        GroupNode::Make("array", Repetition::REPEATED, {key_value}, 
ConvertedType::MAP);
+    std::vector<NodePtr> parquet_fields;
+    parquet_fields.push_back(
+        GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, 
ConvertedType::LIST));
+
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        Invalid,
+        testing::HasSubstr("Group with one repeated child must be 
LIST-annotated."),
+        ConvertSchema(parquet_fields));
+  }
+
+  // List<List<String>>: outer list is two-level encoding, inner list is 
three-level
+  //
+  // optional group my_list (LIST) {
+  //   repeated group array (LIST) {
+  //     repeated group list {
+  //       required binary element (STRING);
+  //     }
+  //   }
+  // }
+  {
+    auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
+                                       ParquetType::BYTE_ARRAY, 
ConvertedType::UTF8);
+    auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+    auto array =
+        GroupNode::Make("array", Repetition::REPEATED, {list}, 
ConvertedType::LIST);
+    std::vector<NodePtr> parquet_fields;
+    parquet_fields.push_back(
+        GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, 
ConvertedType::LIST));
+
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        Invalid, testing::HasSubstr("LIST-annotated groups must not be 
repeated."),
+        ConvertSchema(parquet_fields));
+  }
+}
+
 Status ArrowSchemaToParquetMetadata(std::shared_ptr<::arrow::Schema>& 
arrow_schema,
                                     std::shared_ptr<KeyValueMetadata>& 
metadata) {
   ARROW_ASSIGN_OR_RAISE(
@@ -1846,7 +1953,9 @@ TEST_F(TestLevels, ListErrors) {
   {
     ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make(
         "child_list", Repetition::REPEATED,
-        {PrimitiveNode::Make("bool", Repetition::REPEATED, 
ParquetType::BOOLEAN)},
+        {GroupNode::Make("list", Repetition::REPEATED,
+                         {PrimitiveNode::Make("element", Repetition::REQUIRED,
+                                              ParquetType::BOOLEAN)})},
         LogicalType::List()));
     ASSERT_RAISES(Invalid, error);
     std::string expected("LIST-annotated groups must not be repeated.");
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 0d009c8d4f..0ee595508f 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -513,11 +513,13 @@ Status PopulateLeaf(int column_index, const 
std::shared_ptr<Field>& field,
 }
 
 // Special case mentioned in the format spec:
-//   If the name is array or ends in _tuple, this should be a list of struct
-//   even for single child elements.
-bool HasStructListName(const GroupNode& node) {
+//   If the name is array or uses the parent's name with `_tuple` appended,
+//   this should be:
+//   - a list of list or map type if the repeated group node is LIST- or 
MAP-annotated.
+//   - otherwise, a list of struct even for single child elements.
+bool HasListElementName(const GroupNode& node, const GroupNode& parent) {
   ::std::string_view name{node.name()};
-  return name == "array" || EndsWith(name, "_tuple");
+  return name == "array" || name == (parent.name() + "_tuple");
 }
 
 Status GroupToStruct(const GroupNode& node, LevelInfo current_levels,
@@ -598,9 +600,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
   ctx->LinkParent(value_field, key_value_field);
 
   // required/optional group name=whatever {
-  //   repeated group name=key_values{
+  //   repeated group name=key_values {
   //     required TYPE key;
-  // required/optional TYPE value;
+  //     required/optional TYPE value;
   //   }
   // }
   //
@@ -634,6 +636,7 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
   if (group.is_repeated()) {
     return Status::Invalid("LIST-annotated groups must not be repeated.");
   }
+
   current_levels.Increment(group);
 
   out->children.resize(group.field_count());
@@ -651,38 +654,62 @@ Status ListToSchemaField(const GroupNode& group, 
LevelInfo current_levels,
 
   int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
   if (list_node.is_group()) {
-    // Resolve 3-level encoding
-    //
-    // required/optional group name=whatever {
-    //   repeated group name=list {
-    //     required/optional TYPE item;
-    //   }
-    // }
-    //
-    // yields list<item: TYPE ?nullable> ?nullable
-    //
-    // We distinguish the special case that we have
-    //
-    // required/optional group name=whatever {
-    //   repeated group name=array or $SOMETHING_tuple {
-    //     required/optional TYPE item;
-    //   }
-    // }
-    //
-    // In this latter case, the inner type of the list should be a struct
-    // rather than a primitive value
-    //
-    // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
     const auto& list_group = static_cast<const GroupNode&>(list_node);
-    // Special case mentioned in the format spec:
-    //   If the name is array or ends in _tuple, this should be a list of 
struct
-    //   even for single child elements.
-    if (list_group.field_count() == 1 && !HasStructListName(list_group)) {
-      // List of primitive type
-      RETURN_NOT_OK(
-          NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, 
child_field));
-    } else {
+    if (list_group.field_count() > 1) {
+      // The inner type of the list should be a struct when there are multiple 
fields
+      // in the repeated group
       RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, 
child_field));
+    } else if (list_group.field_count() == 1) {
+      const auto& repeated_field = list_group.field(0);
+      if (repeated_field->is_repeated()) {
+        // Special case where the inner type might be a list with two-level 
encoding
+        // like below:
+        //
+        // required/optional group name=SOMETHING (LIST) {
+        //   repeated group array (LIST) {
+        //     repeated TYPE item;
+        //   }
+        // }
+        //
+        // yields list<item: list<item: TYPE not null> not null> ?nullable
+        if (!list_group.logical_type()->is_list()) {
+          return Status::Invalid("Group with one repeated child must be 
LIST-annotated.");
+        }
+        // LIST-annotated group with three-level encoding cannot be repeated.
+        if (repeated_field->is_group() &&
+            !static_cast<const 
GroupNode&>(*repeated_field).field(0)->is_repeated()) {
+          return Status::Invalid("LIST-annotated groups must not be 
repeated.");
+        }
+        RETURN_NOT_OK(
+            NodeToSchemaField(*repeated_field, current_levels, ctx, out, 
child_field));
+      } else if (HasListElementName(list_group, group)) {
+        // We distinguish the special case that we have
+        //
+        // required/optional group name=SOMETHING {
+        //   repeated group name=array or $SOMETHING_tuple {
+        //     required/optional TYPE item;
+        //   }
+        // }
+        //
+        // The inner type of the list should be a struct rather than a 
primitive value
+        //
+        // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
+        RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, 
child_field));
+      } else {
+        // Resolve 3-level encoding
+        //
+        // required/optional group name=whatever {
+        //   repeated group name=list {
+        //     required/optional TYPE item;
+        //   }
+        // }
+        //
+        // yields list<item: TYPE ?nullable> ?nullable
+        RETURN_NOT_OK(
+            NodeToSchemaField(*repeated_field, current_levels, ctx, out, 
child_field));
+      }
+    } else {
+      return Status::Invalid("Group must have at least one child.");
     }
   } else {
     // Two-level list encoding
@@ -690,6 +717,10 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
     // required/optional group LIST {
     //   repeated TYPE;
     // }
+    //
+    // TYPE is a primitive type
+    //
+    // yields list<item: TYPE not null> ?nullable
     const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
     int column_index = ctx->schema->GetColumnIndex(primitive_node);
     ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index cb7a967414..a7f1d288e6 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit cb7a9674142c137367bf75a01b79c6e214a73199
+Subproject commit a7f1d288e693dbb08e3199851c4eb2140ff8dff2

Reply via email to