This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new a6fe59592c GH-43994: [C++][Parquet] Fix schema conversion from
two-level encoding nested list (#43995)
a6fe59592c is described below
commit a6fe59592c643d591ee719041dd2b5d8b9ada7d6
Author: Gang Wu <[email protected]>
AuthorDate: Mon Nov 25 11:48:41 2024 +0800
GH-43994: [C++][Parquet] Fix schema conversion from two-level encoding
nested list (#43995)
### Rationale for this change
The current C++ parquet implementation interprets following parquet schema
as `array<struct<array:array<int>>>, which is wrong:
```
optional group a (LIST) {
repeated group array (LIST) {
repeated int32 array;
}
}
```
### What changes are included in this PR?
According to the parquet spec, the above schema should be inferred as
`array<array<int>>`.
### Are these changes tested?
Yes, a test case has been added to verify the fix.
### Are there any user-facing changes?
No.
* GitHub Issue: #43994
Authored-by: Gang Wu <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 75 +++++++++++++++
cpp/src/parquet/arrow/arrow_schema_test.cc | 111 +++++++++++++++++++++-
cpp/src/parquet/arrow/schema.cc | 103 +++++++++++++-------
cpp/submodules/parquet-testing | 2 +-
4 files changed, 253 insertions(+), 38 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 78d272ff24..3a3c1e815e 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4085,6 +4085,81 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) {
TryReadDataFile(path);
}
+TEST(TestArrowReaderAdHoc, LegacyTwoLevelList) {
+ auto VerifyData = [](std::unique_ptr<ParquetFileReader> file_reader) {
+ // Expected Parquet schema of legacy two-level encoding
+ constexpr std::string_view kExpectedLegacyList =
+ "required group field_id=-1 a (List) {\n"
+ " repeated group field_id=-1 array (List) {\n"
+ " repeated int32 field_id=-1 array;\n"
+ " }\n"
+ "}\n";
+
+ // Expected Arrow schema and data
+ auto arrow_inner_list =
+ field("array", list(field("array", ::arrow::int32(),
/*nullable=*/false)),
+ /*nullable=*/false);
+ auto arrow_outer_list = list(arrow_inner_list);
+ auto arrow_schema =
+ ::arrow::schema({field("a", arrow_outer_list, /*nullable=*/false)});
+ auto expected_table = TableFromJSON(arrow_schema,
{R"([[[[1,2],[3,4]]]])"});
+
+ // Verify Parquet schema
+ auto root_group = file_reader->metadata()->schema()->group_node();
+ ASSERT_EQ(1, root_group->field_count());
+ std::stringstream nodeStr;
+ PrintSchema(root_group->field(0).get(), nodeStr);
+ ASSERT_EQ(kExpectedLegacyList, nodeStr.str());
+
+ // Verify Arrow schema and data
+ std::unique_ptr<FileReader> reader;
+ ASSERT_OK_NO_THROW(
+ FileReader::Make(default_memory_pool(), std::move(file_reader),
&reader));
+ std::shared_ptr<Table> table;
+ ASSERT_OK(reader->ReadTable(&table));
+ AssertTablesEqual(*expected_table, *table);
+ };
+
+ // Round-trip test for Parquet C++ reader and writer
+ {
+ // Create Parquet schema of legacy two-level encoding
+ auto inner_list = GroupNode::Make("array", Repetition::REPEATED,
+ {schema::Int32("array",
Repetition::REPEATED)},
+ LogicalType::List());
+ auto outer_list =
+ GroupNode::Make("a", Repetition::REQUIRED, {inner_list},
LogicalType::List());
+ auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED,
{outer_list});
+
+ // Create a Parquet writer to write values of nested list
+ auto sink = CreateOutputStream();
+ auto file_writer =
+ ParquetFileWriter::Open(sink,
std::dynamic_pointer_cast<GroupNode>(schema_node));
+ auto row_group_writer = file_writer->AppendRowGroup();
+ auto int_writer =
dynamic_cast<Int32Writer*>(row_group_writer->NextColumn());
+ ASSERT_TRUE(int_writer != nullptr);
+
+ // Directly write a single row of nested list: [[1, 2],[3, 4]]
+ constexpr int64_t kNumValues = 4;
+ constexpr std::array<int16_t, kNumValues> kRepLevels = {0, 2, 1, 2};
+ constexpr std::array<int16_t, kNumValues> kDefLevels = {2, 2, 2, 2};
+ constexpr std::array<int32_t, kNumValues> kValues = {1, 2, 3, 4};
+ int_writer->WriteBatch(kNumValues, kDefLevels.data(), kRepLevels.data(),
+ kValues.data());
+ file_writer->Close();
+ ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
+
+ // Read schema and verify it applies two-level encoding of list type
+ ASSERT_NO_FATAL_FAILURE(
+
VerifyData(ParquetFileReader::Open(std::make_shared<BufferReader>(buffer))));
+ }
+
+ // Interoperability test for Parquet file generated by parquet-java
+ {
+ auto path = std::string(test::get_data_dir()) +
"/old_list_structure.parquet";
+ ASSERT_NO_FATAL_FAILURE(VerifyData(ParquetFileReader::OpenFile(path)));
+ }
+}
+
class TestArrowReaderAdHocSparkAndHvr
: public ::testing::TestWithParam<
std::tuple<std::string, std::shared_ptr<DataType>>> {};
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index d261482d89..a6e04e5425 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -18,6 +18,7 @@
#include <memory>
#include <vector>
+#include "gmock/gmock-matchers.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
@@ -601,6 +602,58 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
arrow_fields.push_back(::arrow::field("name", arrow_list, false));
}
+ // Two-level encoding List<List<Integer>>:
+ // optional group my_list (LIST) {
+ // repeated group array (LIST) {
+ // repeated int32 array;
+ // }
+ // }
+ {
+ auto inner_array =
+ PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
+ auto outer_array = GroupNode::Make("array", Repetition::REPEATED,
{inner_array},
+ ConvertedType::LIST);
+ parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
+ {outer_array},
ConvertedType::LIST));
+ auto arrow_inner_array = ::arrow::field("array", INT32,
/*nullable=*/false);
+ auto arrow_outer_array =
+ ::arrow::field("array", ::arrow::list(arrow_inner_array),
/*nullable=*/false);
+ auto arrow_list = ::arrow::list(arrow_outer_array);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
+
+ // List<Map<String, String>> in three-level list encoding:
+ // optional group my_list (LIST) {
+ // repeated group list {
+ // required group element (MAP) {
+ // repeated group key_value {
+ // required binary key (STRING);
+ // optional binary value (STRING);
+ // }
+ // }
+ // }
+ // }
+ {
+ auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
+ ConvertedType::UTF8);
+ auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key,
value});
+ auto element =
+ GroupNode::Make("element", Repetition::REQUIRED, {key_value},
ConvertedType::MAP);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
ConvertedType::LIST));
+
+ auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
+ auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
+ auto arrow_element = ::arrow::field(
+ "element", std::make_shared<::arrow::MapType>(arrow_key, arrow_value),
+ /*nullable=*/false);
+ auto arrow_list = ::arrow::list(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list,
/*nullable=*/true));
+ }
+
auto arrow_schema = ::arrow::schema(arrow_fields);
ASSERT_OK(ConvertSchema(parquet_fields));
@@ -727,6 +780,60 @@ TEST_F(TestConvertParquetSchema,
ParquetRepeatedNestedSchema) {
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
}
+TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
+ // List<Map<String, String>> in two-level list encoding:
+ //
+ // optional group my_list (LIST) {
+ // repeated group array (MAP) {
+ // repeated group key_value {
+ // required binary key (STRING);
+ // optional binary value (STRING);
+ // }
+ // }
+ // }
+ {
+ auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
+ ConvertedType::UTF8);
+ auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key,
value});
+ auto array =
+ GroupNode::Make("array", Repetition::REPEATED, {key_value},
ConvertedType::MAP);
+ std::vector<NodePtr> parquet_fields;
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ testing::HasSubstr("Group with one repeated child must be
LIST-annotated."),
+ ConvertSchema(parquet_fields));
+ }
+
+ // List<List<String>>: outer list is two-level encoding, inner list is
three-level
+ //
+ // optional group my_list (LIST) {
+ // repeated group array (LIST) {
+ // repeated group list {
+ // required binary element (STRING);
+ // }
+ // }
+ // }
+ {
+ auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ auto array =
+ GroupNode::Make("array", Repetition::REPEATED, {list},
ConvertedType::LIST);
+ std::vector<NodePtr> parquet_fields;
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, testing::HasSubstr("LIST-annotated groups must not be
repeated."),
+ ConvertSchema(parquet_fields));
+ }
+}
+
Status ArrowSchemaToParquetMetadata(std::shared_ptr<::arrow::Schema>&
arrow_schema,
std::shared_ptr<KeyValueMetadata>&
metadata) {
ARROW_ASSIGN_OR_RAISE(
@@ -1846,7 +1953,9 @@ TEST_F(TestLevels, ListErrors) {
{
::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make(
"child_list", Repetition::REPEATED,
- {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ {GroupNode::Make("list", Repetition::REPEATED,
+ {PrimitiveNode::Make("element", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})},
LogicalType::List()));
ASSERT_RAISES(Invalid, error);
std::string expected("LIST-annotated groups must not be repeated.");
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 0d009c8d4f..0ee595508f 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -513,11 +513,13 @@ Status PopulateLeaf(int column_index, const
std::shared_ptr<Field>& field,
}
// Special case mentioned in the format spec:
-// If the name is array or ends in _tuple, this should be a list of struct
-// even for single child elements.
-bool HasStructListName(const GroupNode& node) {
+// If the name is array or uses the parent's name with `_tuple` appended,
+// this should be:
+// - a list of list or map type if the repeated group node is LIST- or
MAP-annotated.
+// - otherwise, a list of struct even for single child elements.
+bool HasListElementName(const GroupNode& node, const GroupNode& parent) {
::std::string_view name{node.name()};
- return name == "array" || EndsWith(name, "_tuple");
+ return name == "array" || name == (parent.name() + "_tuple");
}
Status GroupToStruct(const GroupNode& node, LevelInfo current_levels,
@@ -598,9 +600,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo
current_levels,
ctx->LinkParent(value_field, key_value_field);
// required/optional group name=whatever {
- // repeated group name=key_values{
+ // repeated group name=key_values {
// required TYPE key;
- // required/optional TYPE value;
+ // required/optional TYPE value;
// }
// }
//
@@ -634,6 +636,7 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo
current_levels,
if (group.is_repeated()) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}
+
current_levels.Increment(group);
out->children.resize(group.field_count());
@@ -651,38 +654,62 @@ Status ListToSchemaField(const GroupNode& group,
LevelInfo current_levels,
int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
if (list_node.is_group()) {
- // Resolve 3-level encoding
- //
- // required/optional group name=whatever {
- // repeated group name=list {
- // required/optional TYPE item;
- // }
- // }
- //
- // yields list<item: TYPE ?nullable> ?nullable
- //
- // We distinguish the special case that we have
- //
- // required/optional group name=whatever {
- // repeated group name=array or $SOMETHING_tuple {
- // required/optional TYPE item;
- // }
- // }
- //
- // In this latter case, the inner type of the list should be a struct
- // rather than a primitive value
- //
- // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
const auto& list_group = static_cast<const GroupNode&>(list_node);
- // Special case mentioned in the format spec:
- // If the name is array or ends in _tuple, this should be a list of
struct
- // even for single child elements.
- if (list_group.field_count() == 1 && !HasStructListName(list_group)) {
- // List of primitive type
- RETURN_NOT_OK(
- NodeToSchemaField(*list_group.field(0), current_levels, ctx, out,
child_field));
- } else {
+ if (list_group.field_count() > 1) {
+ // The inner type of the list should be a struct when there are multiple
fields
+ // in the repeated group
RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out,
child_field));
+ } else if (list_group.field_count() == 1) {
+ const auto& repeated_field = list_group.field(0);
+ if (repeated_field->is_repeated()) {
+ // Special case where the inner type might be a list with two-level
encoding
+ // like below:
+ //
+ // required/optional group name=SOMETHING (LIST) {
+ // repeated group array (LIST) {
+ // repeated TYPE item;
+ // }
+ // }
+ //
+ // yields list<item: list<item: TYPE not null> not null> ?nullable
+ if (!list_group.logical_type()->is_list()) {
+ return Status::Invalid("Group with one repeated child must be
LIST-annotated.");
+ }
+ // LIST-annotated group with three-level encoding cannot be repeated.
+ if (repeated_field->is_group() &&
+ !static_cast<const
GroupNode&>(*repeated_field).field(0)->is_repeated()) {
+ return Status::Invalid("LIST-annotated groups must not be
repeated.");
+ }
+ RETURN_NOT_OK(
+ NodeToSchemaField(*repeated_field, current_levels, ctx, out,
child_field));
+ } else if (HasListElementName(list_group, group)) {
+ // We distinguish the special case that we have
+ //
+ // required/optional group name=SOMETHING {
+ // repeated group name=array or $SOMETHING_tuple {
+ // required/optional TYPE item;
+ // }
+ // }
+ //
+ // The inner type of the list should be a struct rather than a
primitive value
+ //
+ // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
+ RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out,
child_field));
+ } else {
+ // Resolve 3-level encoding
+ //
+ // required/optional group name=whatever {
+ // repeated group name=list {
+ // required/optional TYPE item;
+ // }
+ // }
+ //
+ // yields list<item: TYPE ?nullable> ?nullable
+ RETURN_NOT_OK(
+ NodeToSchemaField(*repeated_field, current_levels, ctx, out,
child_field));
+ }
+ } else {
+ return Status::Invalid("Group must have at least one child.");
}
} else {
// Two-level list encoding
@@ -690,6 +717,10 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo
current_levels,
// required/optional group LIST {
// repeated TYPE;
// }
+ //
+ // TYPE is a primitive type
+ //
+ // yields list<item: TYPE not null> ?nullable
const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
int column_index = ctx->schema->GetColumnIndex(primitive_node);
ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index cb7a967414..a7f1d288e6 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit cb7a9674142c137367bf75a01b79c6e214a73199
+Subproject commit a7f1d288e693dbb08e3199851c4eb2140ff8dff2