pitrou commented on a change in pull request #7973: URL: https://github.com/apache/arrow/pull/7973#discussion_r474112279
########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + 
+TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); Review comment: This is `list(list(boolean))` in Arrow terms? Perhaps add comments as above? 
########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + 
+TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + 
EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + 
ConvertedType::LIST)})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. Review comment: Hmm... what is a null list? What does it mean to have a def level that doesn't map to anything? ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", 
Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + 
/*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field Review 
comment: This would be `list(struct(child: list(boolean)))`? ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> 
descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, 
RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); Review comment: Is it `struct(child: struct(inner: boolean)) not null`? ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. 
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, 
/*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); Review comment: Is it `struct(child: struct(inner: boolean))`? 
########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + 
+TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + 
EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + 
ConvertedType::LIST)})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/5, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f0 bool field + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/1)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f1 bool field + + // Legacy 2-level necoding + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)})); + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. 
+ Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // inner struct field +} + +TEST_F(TestLevels, ListErrors) { + { + ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make( + "child_list", Repetition::REPEATED, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)); + EXPECT_TRUE(error.IsInvalid()); Review comment: `ASSERT_RAISES(Invalid, error)` ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. 
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, 
/*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, 
/*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + ConvertedType::LIST)})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. 
+ Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/5, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f0 bool field + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/1)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f1 bool field + + // Legacy 2-level necoding + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)})); + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. 
+ Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // inner struct field +} + +TEST_F(TestLevels, ListErrors) { + { + ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make( + "child_list", Repetition::REPEATED, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)); + EXPECT_TRUE(error.IsInvalid()); + std::string expected("LIST-annotated groups must not be repeated."); + EXPECT_EQ(error.message().substr(0, expected.size()), expected); + } Review comment: Why is it an error to have a repeated LIST-annotated group? ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. 
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, 
/*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field Review comment: Do we want to test that it's actually mapped to an Arrow `list(boolean)`? Or is that done elsewhere in the tests? (or is it `list(boolean not null)`?) ########## File path: cpp/src/parquet/arrow/schema.cc ########## @@ -410,21 +410,66 @@ ::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode( return storage_type; } -Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_rep_level, +struct LevelInfo { + int16_t def_level = 0; + int16_t rep_level = 0; + int16_t repeated_ancestor_def_level = 0; + + /// Copies current levels to the schema field. + void Populate(SchemaField* out) { + out->definition_level = def_level; + out->repetition_level = rep_level; + out->repeated_ancestor_definition_level = repeated_ancestor_def_level; + } + + /// Increments levels according to the cardinality of node. + void Increment(const Node& node) { + if (node.is_repeated()) { + IncrementRepeated(); + return; + } + if (node.is_optional()) { + IncrementOptional(); + return; + } + } + + /// Incremetns level for a optional node. + void IncrementOptional() { def_level++; } + + /// Increments levels for the repeated node. Returns + /// the previous ancestor_list_def_level. + int16_t IncrementRepeated() { + int16_t last_repeated_ancestor = repeated_ancestor_def_level; + + // Repeated fields add both a repetition and definition level. This is used + // to distinguish between an empty list and a list with an item in it. 
+ ++rep_level; + ++def_level; + // For levels >= current_def_level it indicates the list was Review comment: What is "current_def_level"? ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + 
ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + 
{PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + 
{PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + ConvertedType::LIST)})); Review comment: Hmm... so the `ConvertedType::LIST` says that `child_list` is semantically a list even though it cannot be repeated, right? Is there a reason we don't use `LogicalType` instead? (AFAIU, `ConvertedType` is legacy?) ########## File path: cpp/src/parquet/arrow/schema.cc ########## @@ -606,23 +652,27 @@ Status NodeToSchemaField(const Node& node, int16_t current_def_level, if (node.is_repeated()) { // One-level list encoding, e.g. // a: repeated int32; + int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated(); out->children.resize(1); auto child_field = ::arrow::field(node.name(), type, /*nullable=*/false); - RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_def_level, - current_rep_level, ctx, out, &out->children[0])); + RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_levels, ctx, out, + &out->children[0])); out->field = ::arrow::field(node.name(), ::arrow::list(child_field), /*nullable=*/false, FieldIdMetadata(node.field_id())); // Is this right? Review comment: Do we need to keep this comment? ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. 
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, 
/*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, 
/*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + ConvertedType::LIST)})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. 
+ Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/5, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f0 bool field + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/1)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f1 bool field + + // Legacy 2-level necoding + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)})); + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // inner struct field Review comment: You mean "bool field"? 
########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + 
+TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + 
EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + 
ConvertedType::LIST)})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/5, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f0 bool field + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/1)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f1 bool field + + // Legacy 2-level necoding Review comment: Can you elaborate why it's legacy? What would be the non-legacy encoding? ########## File path: cpp/src/parquet/arrow/schema.cc ########## @@ -554,41 +605,36 @@ Status GroupToSchemaField(const GroupNode& node, int16_t current_def_level, // repeated group $NAME { // r/o TYPE[0] f0 // r/o TYPE[1] f1 - // } Review comment: Why? ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. 
What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, 
ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, 
ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + ConvertedType::LIST)})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. 
+ Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/5, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f0 bool field + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/1)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f1 bool field + + // Legacy 2-level necoding + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)})); + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. 
+ Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // inner struct field +} + +TEST_F(TestLevels, ListErrors) { + { + ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make( + "child_list", Repetition::REPEATED, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)); + EXPECT_TRUE(error.IsInvalid()); + std::string expected("LIST-annotated groups must not be repeated."); + EXPECT_EQ(error.message().substr(0, expected.size()), expected); Review comment: Or something like: ```c++ EXPECT_THAT( error.message(), testing::StartsWith( "LIST-annotated groups must not be repeated")); ``` ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. 
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, 
/*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); Review comment: Or is it `struct(child: struct(inner: boolean not null))`? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org