wgtmac commented on code in PR #43995:
URL: https://github.com/apache/arrow/pull/43995#discussion_r1814615707
##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -4093,6 +4093,81 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) {
TryReadDataFile(path);
}
+TEST(TestArrowReaderAdHoc, LegacyTwoLevelList) {
+  auto VerifyData = [](std::unique_ptr<ParquetFileReader> file_reader) {
+    // Expected Parquet schema of legacy two-level encoding
+    constexpr std::string_view kExpectedLegacyList =
+        "required group field_id=-1 a (List) {\n"
+        "  repeated group field_id=-1 array (List) {\n"
+        "    repeated int32 field_id=-1 array;\n"
+        "  }\n"
+        "}\n";
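+    // Note: in this legacy two-level form the single repeated field nested in
+    // the list group is the element itself; the standard three-level encoding
+    // would instead wrap an "element" field in an inner repeated group.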
+
+    // Expected Arrow schema and data
+    auto arrow_inner_list =
+        field("array", list(field("array", ::arrow::int32(), /*nullable=*/false)),
+              /*nullable=*/false);
+    auto arrow_outer_list = list(arrow_inner_list);
+    auto arrow_schema =
+        ::arrow::schema({field("a", arrow_outer_list, /*nullable=*/false)});
+    auto expected_table = TableFromJSON(arrow_schema, {R"([[[[1,2],[3,4]]]])"});
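+    // The element field is non-nullable because a repeated field in the
+    // two-level encoding cannot encode nulls; "a" is non-nullable because the
+    // Parquet field is declared REQUIRED.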
+
+    // Verify Parquet schema
+    auto root_group = file_reader->metadata()->schema()->group_node();
+    ASSERT_EQ(1, root_group->field_count());
+    std::stringstream nodeStr;
+    PrintSchema(root_group->field(0).get(), nodeStr);
+    ASSERT_EQ(kExpectedLegacyList, nodeStr.str());
+
+    // Verify Arrow schema and data
+    std::unique_ptr<FileReader> reader;
+    ASSERT_OK_NO_THROW(
+        FileReader::Make(default_memory_pool(), std::move(file_reader), &reader));
+    std::shared_ptr<Table> table;
+    ASSERT_OK(reader->ReadTable(&table));
+    AssertTablesEqual(*expected_table, *table);
+  };
+
+  // Round-trip test for Parquet C++ reader and writer
+  {
+    // Create Parquet schema of legacy two-level encoding
+    auto inner_list = GroupNode::Make("array", Repetition::REPEATED,
+                                      {schema::Int32("array", Repetition::REPEATED)},
+                                      LogicalType::List());
+    auto outer_list =
+        GroupNode::Make("a", Repetition::REQUIRED, {inner_list}, LogicalType::List());
+    auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, {outer_list});
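+    // Because the list group's only child is a repeated primitive (not a
+    // repeated group wrapping an element field), this yields the legacy
+    // two-level layout directly.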
+
+    // Create a Parquet writer to write values of nested list
+    auto sink = CreateOutputStream();
+    auto file_writer =
+        ParquetFileWriter::Open(sink, std::dynamic_pointer_cast<GroupNode>(schema_node));
+    auto row_group_writer = file_writer->AppendRowGroup();
+    auto int_writer = dynamic_cast<Int32Writer*>(row_group_writer->NextColumn());
+    ASSERT_TRUE(int_writer != nullptr);
+
+    // Directly write a single row of nested list: [[1, 2], [3, 4]]
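+    // With two repeated nodes in the schema path, the max definition and
+    // repetition levels are both 2: def=2 means the value is present, rep=0
+    // starts a new row, rep=1 starts a new inner list, and rep=2 appends to
+    // the current inner list.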
+    constexpr int64_t kNumValues = 4;
+    constexpr std::array<int16_t, kNumValues> kRepLevels = {0, 2, 1, 2};
+    constexpr std::array<int16_t, kNumValues> kDefLevels = {2, 2, 2, 2};
+    constexpr std::array<int32_t, kNumValues> kValues = {1, 2, 3, 4};
+    int_writer->WriteBatch(kNumValues, kDefLevels.data(), kRepLevels.data(),
+                           kValues.data());
+    file_writer->Close();
+    ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
+
+    // Read schema and verify it applies two-level encoding of list type
+    ASSERT_NO_FATAL_FAILURE(
+        VerifyData(ParquetFileReader::Open(std::make_shared<BufferReader>(buffer))));
+  }
+
+  // Interoperability test for Parquet file generated by parquet-java
Review Comment:
   They have the same content, so we can share the verification process in the
   same test case.
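
   For illustration only, a minimal sketch of how the interoperability branch
   could reuse the same `VerifyData` lambda; the file name below is hypothetical,
   and `PARQUET_TEST_DATA` is assumed to point at the test data directory as in
   the neighboring ad-hoc tests:

   ```cpp
   // Hypothetical sketch: reuse VerifyData for a parquet-java generated file.
   // "legacy_two_level_list.parquet" is an illustrative name, not the real file.
   if (const char* data_dir = std::getenv("PARQUET_TEST_DATA")) {
     std::string path = std::string(data_dir) + "/legacy_two_level_list.parquet";
     ASSERT_NO_FATAL_FAILURE(VerifyData(ParquetFileReader::OpenFile(path)));
   }
   ```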