emkornfield commented on a change in pull request #7973: URL: https://github.com/apache/arrow/pull/7973#discussion_r478100513
########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels( + const SchemaManifest& manifest, int column_number) { + std::deque<LevelInfo> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front(field->level_info); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, + /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, + /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: list(bool not null) not null + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + // Arrow schema: struct(child: struct(inner: boolean not null)) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean )) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean)) not null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + // Arrow schema: list(struct(child: struct(list(bool not null) not null)) non null) not + // null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field Review comment: done ########## File path: cpp/src/parquet/level_conversion.h ########## @@ -20,10 +20,117 @@ #include <cstdint> #include "parquet/platform.h" +#include "parquet/schema.h" namespace parquet { namespace internal { +struct PARQUET_EXPORT LevelInfo { + LevelInfo() + : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {} + LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repitition_level, + int32_t repeated_ancestor_definition_level) + : null_slot_usage(null_slots), + def_level(definition_level), + rep_level(repitition_level), + repeated_ancestor_def_level(repeated_ancestor_definition_level) {} + + bool operator==(const LevelInfo& b) const { + return null_slot_usage == b.null_slot_usage && def_level == b.def_level && + rep_level == b.rep_level && + repeated_ancestor_def_level == b.repeated_ancestor_def_level; + } + + // How many slots a null element consumes. + // This is only ever >1 for descendents of + // FixedSizeList. + int32_t null_slot_usage = 1; + + // The definition level at which the value for the field + // is considered not null (definition levels greater than + // or equal to indicate this value indicate a not-null + // value for the field). For list fields definition levels + // greater then or equal to this field indicate a present + // , possibly null, element. + int16_t def_level = 0; + + // The repetition level corresponding to this element + // or the closest repeated ancestor. Any repetition + // level less than this indicates either a new list OR + // an empty list (which is determined in conjunction + // definition_level). + int16_t rep_level = 0; + + // The definition level indicating the level at which the closest + // repeated ancestor was not empty. This is used to discriminate Review comment: I don't think I understand this question. since this mentions definition level this is really a parquet construct. ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels( + const SchemaManifest& manifest, int column_number) { + std::deque<LevelInfo> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front(field->level_info); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, + /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, + /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: list(bool not null) not null + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + // Arrow schema: struct(child: struct(inner: boolean not null)) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean )) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean)) not null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + // Arrow schema: list(struct(child: struct(list(bool not null) not null)) non null) not + // null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + // Arrow schema: list(struct(child_list: list(struct(f0: bool f1: bool no-required ))) + // not null) not null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + LogicalType::List())})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_level=2 is handled together with def_level=3 + // When decoding. Def_level=2 indicate present but empty + // list. def_level=3 indicates a present element in the + // list. + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/5, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f0 bool field Review comment: I agree, nice catch, there was in fact a bug in the linkages. ########## File path: cpp/src/parquet/level_conversion.h ########## @@ -20,10 +20,117 @@ #include <cstdint> #include "parquet/platform.h" +#include "parquet/schema.h" namespace parquet { namespace internal { +struct PARQUET_EXPORT LevelInfo { + LevelInfo() + : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {} + LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repitition_level, Review comment: fixed. ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + ConvertedType::LIST)})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/5, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f0 bool field + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/1)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f1 bool field + + // Legacy 2-level necoding + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)})); + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // inner struct field +} + +TEST_F(TestLevels, ListErrors) { + { + ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make( + "child_list", Repetition::REPEATED, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)); + EXPECT_TRUE(error.IsInvalid()); + std::string expected("LIST-annotated groups must not be repeated."); + EXPECT_EQ(error.message().substr(0, expected.size()), expected); + } Review comment: Yes it should be named "list" unseen I think references that isn't modelled on the arrow side. ########## File path: cpp/src/parquet/level_conversion.h ########## @@ -20,10 +20,117 @@ #include <cstdint> #include "parquet/platform.h" +#include "parquet/schema.h" namespace parquet { namespace internal { +struct PARQUET_EXPORT LevelInfo { + LevelInfo() + : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {} + LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repitition_level, + int32_t repeated_ancestor_definition_level) + : null_slot_usage(null_slots), + def_level(definition_level), + rep_level(repitition_level), + repeated_ancestor_def_level(repeated_ancestor_definition_level) {} + + bool operator==(const LevelInfo& b) const { + return null_slot_usage == b.null_slot_usage && def_level == b.def_level && + rep_level == b.rep_level && + repeated_ancestor_def_level == b.repeated_ancestor_def_level; + } + + // How many slots a null element consumes. + // This is only ever >1 for descendents of + // FixedSizeList. + int32_t null_slot_usage = 1; + + // The definition level at which the value for the field + // is considered not null (definition levels greater than + // or equal to indicate this value indicate a not-null + // value for the field). For list fields definition levels + // greater then or equal to this field indicate a present + // , possibly null, element. + int16_t def_level = 0; + + // The repetition level corresponding to this element + // or the closest repeated ancestor. Any repetition + // level less than this indicates either a new list OR + // an empty list (which is determined in conjunction + // definition_level). Review comment: definition levels in general. We only really care about where repetition level increases in which case the def_level of node should be sufficient to determine this. ########## File path: cpp/src/parquet/level_conversion.h ########## @@ -20,10 +20,117 @@ #include <cstdint> #include "parquet/platform.h" +#include "parquet/schema.h" namespace parquet { namespace internal { +struct PARQUET_EXPORT LevelInfo { + LevelInfo() + : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {} + LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repitition_level, + int32_t repeated_ancestor_definition_level) + : null_slot_usage(null_slots), + def_level(definition_level), + rep_level(repitition_level), + repeated_ancestor_def_level(repeated_ancestor_definition_level) {} + + bool operator==(const LevelInfo& b) const { + return null_slot_usage == b.null_slot_usage && def_level == b.def_level && + rep_level == b.rep_level && + repeated_ancestor_def_level == b.repeated_ancestor_def_level; + } + + // How many slots a null element consumes. + // This is only ever >1 for descendents of + // FixedSizeList. + int32_t null_slot_usage = 1; + + // The definition level at which the value for the field + // is considered not null (definition levels greater than + // or equal to indicate this value indicate a not-null Review comment: yes. fixed. ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +struct Levels { + int16_t def_level; + int16_t rep_level; + int16_t repeated_ancestor_def; + friend std::ostream& operator<<(std::ostream& os, const Levels& levels) { + // This print method is to silence valgrind issues. What's printed + // is not important because all asserts happen directly on + // members. + os << "{def=" << levels.def_level << ", rep=" << levels.rep_level + << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}"; + return os; + } +}; + +bool operator==(const Levels& a, const Levels& b) { + return a.def_level == b.def_level && a.rep_level == b.rep_level && + a.repeated_ancestor_def == b.repeated_ancestor_def; +} + +::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest& manifest, + int column_number) { + std::deque<Levels> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front({field->definition_level, field->repetition_level, + field->repeated_ancestor_definition_level}); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + Levels{/*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + Levels{/*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + ConvertedType::LIST)})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/5, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f0 bool field + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/1)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + Levels{/*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f1 bool field + + // Legacy 2-level necoding + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)})); + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, + ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_ldevl=2 is skipped because it represents a null list. + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + Levels{/*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // inner struct field +} + +TEST_F(TestLevels, ListErrors) { + { + ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make( + "child_list", Repetition::REPEATED, + {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + ConvertedType::LIST)); + EXPECT_TRUE(error.IsInvalid()); + std::string expected("LIST-annotated groups must not be repeated."); + EXPECT_EQ(error.message().substr(0, expected.size()), expected); + } Review comment: I think you are right this documention is off. I think I mostly copy and pasted from existing code. ########## File path: cpp/src/parquet/arrow/schema.cc ########## @@ -477,12 +484,11 @@ Status ListToSchemaField(const GroupNode& group, int16_t current_def_level, const Node& list_node = *group.field(0); if (!list_node.is_repeated()) { - return Status::NotImplemented( + return Status::Invalid( "Non-repeated nodes in a LIST-annotated group are not supported."); } - ++current_def_level; - ++current_rep_level; + int16_t repeated_ancesor_def_level = current_levels.IncrementRepeated(); Review comment: yes. fixed. ########## File path: cpp/src/parquet/level_conversion.h ########## @@ -20,10 +20,117 @@ #include <cstdint> #include "parquet/platform.h" +#include "parquet/schema.h" namespace parquet { namespace internal { +struct PARQUET_EXPORT LevelInfo { + LevelInfo() + : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {} + LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repitition_level, + int32_t repeated_ancestor_definition_level) + : null_slot_usage(null_slots), + def_level(definition_level), + rep_level(repitition_level), + repeated_ancestor_def_level(repeated_ancestor_definition_level) {} + + bool operator==(const LevelInfo& b) const { + return null_slot_usage == b.null_slot_usage && def_level == b.def_level && + rep_level == b.rep_level && + repeated_ancestor_def_level == b.repeated_ancestor_def_level; + } + + // How many slots a null element consumes. Review comment: not quite, tried to reword. This reflects when parquet has an undefined but present element (i.e. null in arrow) how many slots in arrow are used in the arrow array. For fixed size lists a null fixed size list forces all of its non nested children to have N null slots where N is the length of the fixed size list. Nested fixed size lists increases this number multiplicatively . ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels( + const SchemaManifest& manifest, int column_number) { + std::deque<LevelInfo> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front(field->level_info); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, + /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, + /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: list(bool not null) not null + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + // Arrow schema: struct(child: struct(inner: boolean not null)) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean )) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean)) not null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + // Arrow schema: list(struct(child: struct(list(bool not null) not null)) non null) not Review comment: I don't believe the outer struct is nullable. See comment below. ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels( + const SchemaManifest& manifest, int column_number) { + std::deque<LevelInfo> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front(field->level_info); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, + /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, + /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: list(bool not null) not null + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + // Arrow schema: struct(child: struct(inner: boolean not null)) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean )) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean)) not null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + // Arrow schema: list(struct(child: struct(list(bool not null) not null)) non null) not + // null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + // Arrow schema: list(struct(child_list: list(struct(f0: bool f1: bool no-required ))) + // not null) not null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child_list", Repetition::OPTIONAL, + {GroupNode::Make( + "list", Repetition::REPEATED, + {GroupNode::Make( + "element", Repetition::OPTIONAL, + {PrimitiveNode::Make("f0", Repetition::OPTIONAL, ParquetType::BOOLEAN), + PrimitiveNode::Make("f1", Repetition::REQUIRED, + ParquetType::BOOLEAN)})})}, + LogicalType::List())})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_level=2 is handled together with def_level=3 + // When decoding. Def_level=2 indicate present but empty + // list. def_level=3 indicates a present element in the + // list. + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/5, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f0 bool field + + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/1)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + // Def_level=2 is handled together with def_level=3 + // When decoding. Def_level=2 indicate present but empty + // list. def_level=3 indicates a present element in the + // list. + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // list field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3}, // inner struct field + + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/4, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // f1 bool field + + // Arrow schema: list(struct(child_list: list(bool not null)) not null) not null + // Legacy 2-level necoding (required for backwards compatibility. See Review comment: done. ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels( + const SchemaManifest& manifest, int column_number) { + std::deque<LevelInfo> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front(field->level_info); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, + /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, + /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: list(bool not null) not null + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + // Arrow schema: struct(child: struct(inner: boolean not null)) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean )) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean)) not null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { Review comment: Added a few more, let me know if you would like to see more. ########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) { ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } +::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels( + const SchemaManifest& manifest, int column_number) { + std::deque<LevelInfo> out; + const SchemaField* field; + RETURN_NOT_OK(manifest.GetColumnField(column_number, &field)); + while (field != nullptr) { + out.push_front(field->level_info); + field = manifest.GetParent(field); + } + return out; +} + +class TestLevels : public ::testing::Test { + public: + virtual void SetUp() {} + + ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) { + descriptor_.reset(new SchemaDescriptor()); + manifest_.reset(new SchemaManifest()); + descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column})); + return SchemaManifest::Make(descriptor_.get(), + std::shared_ptr<const ::arrow::KeyValueMetadata>(), + ArrowReaderProperties(), manifest_.get()); + } + void SetParquetSchema(const NodePtr& column) { + ASSERT_OK(MaybeSetParquetSchema(column)); + } + + protected: + std::unique_ptr<SchemaDescriptor> descriptor_; + std::unique_ptr<SchemaManifest> manifest_; +}; + +TEST_F(TestLevels, TestPrimitive) { + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REQUIRED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, + /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::OPTIONAL, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, + /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: list(bool not null) not null + SetParquetSchema( + PrimitiveNode::Make("node_name", Repetition::REPEATED, ParquetType::BOOLEAN)); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 0}, // List Field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1})); // primitive field +} + +TEST_F(TestLevels, TestSimpleGroups) { + // Arrow schema: struct(child: struct(inner: boolean not null)) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REQUIRED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean )) + SetParquetSchema(GroupNode::Make( + "parent", Repetition::OPTIONAL, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); + + // Arrow schema: struct(child: struct(inner: boolean)) not null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REQUIRED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::OPTIONAL, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/0, + /*ancestor_list_def_level*/ 0})); +} + +TEST_F(TestLevels, TestRepeatedGroups) { + // Arrow schema: list(struct(child: struct(list(bool not null) not null)) non null) not + // null + SetParquetSchema(GroupNode::Make( + "parent", Repetition::REPEATED, + {GroupNode::Make( + "child", Repetition::OPTIONAL, + {PrimitiveNode::Make("inner", Repetition::REPEATED, ParquetType::BOOLEAN)})})); + ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels, + RootToTreeLeafLevels(*manifest_, /*column_number=*/0)); + EXPECT_THAT( + levels, + ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2, /*rep_level=*/1, + /*ancestor_list_def_level*/ 1}, // optional child struct + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 1}, // repeated field + LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3, /*rep_level=*/2, + /*ancestor_list_def_level*/ 3})); // innter field + + // Arrow schema: list(struct(child_list: list(struct(f0: bool f1: bool no-required ))) + // not null) not null Review comment: Hmm, I don't think the outer struct is nullable. "parent" ends up mapping to two arrow fields: list and struct. they share the same def level, so it there is an element in list struct must be present. But I could be misreading this. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org