emkornfield commented on a change in pull request #7973:
URL: https://github.com/apache/arrow/pull/7973#discussion_r474409559
##########
File path: cpp/src/parquet/arrow/schema.cc
##########
@@ -410,21 +410,66 @@ ::arrow::Result<std::shared_ptr<ArrowType>>
GetTypeForNode(
return storage_type;
}
-Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t
max_rep_level,
+struct LevelInfo {
+ int16_t def_level = 0;
+ int16_t rep_level = 0;
+ int16_t repeated_ancestor_def_level = 0;
Review comment:
consolidated into 1 in level_conversions, where a follow-up PR will use
it as an argument to the levels there.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
Review comment:
the latter. added comment.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
Review comment:
yes.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
Review comment:
yes.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/5, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool
field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool
field
+
+ // Legacy 2-level necoding
Review comment:
Non-legacy is the three-level encoding above.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
Review comment:
Yes. The recommended standard for list types is the 3-level encoding, which
has an outer group indicating nullability of the "list" field, a repeated
child group to indicate its repeatedness, and an inner group to indicate
nullability of elements. Reference:
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/5, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool
field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool
field
+
+ // Legacy 2-level necoding
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST)}));
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // inner
struct field
Review comment:
yes. fixed.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
Review comment:
Correct, ConvertedType is legacy; this was a copy-paste error and should be
fixed now.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/5, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool
field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool
field
+
+ // Legacy 2-level necoding
Review comment:
It is called legacy because it isn't recommended. I gave a pointer to
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
##########
File path: cpp/src/parquet/arrow/schema.cc
##########
@@ -606,23 +652,27 @@ Status NodeToSchemaField(const Node& node, int16_t
current_def_level,
if (node.is_repeated()) {
// One-level list encoding, e.g.
// a: repeated int32;
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
out->children.resize(1);
auto child_field = ::arrow::field(node.name(), type, /*nullable=*/false);
- RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_def_level,
- current_rep_level, ctx, out,
&out->children[0]));
+ RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_levels,
ctx, out,
+ &out->children[0]));
out->field = ::arrow::field(node.name(), ::arrow::list(child_field),
/*nullable=*/false,
FieldIdMetadata(node.field_id()));
// Is this right?
Review comment:
Probably not, but I found it entertaining. Removed.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/5, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool
field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool
field
+
+ // Legacy 2-level necoding
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST)}));
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // inner
struct field
+}
+
+TEST_F(TestLevels, ListErrors) {
+ {
+ ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make(
+ "child_list", Repetition::REPEATED,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST));
+ EXPECT_TRUE(error.IsInvalid());
+ std::string expected("LIST-annotated groups must not be repeated.");
+ EXPECT_EQ(error.message().substr(0, expected.size()), expected);
Review comment:
I was originally using testing::HasSubstr, but it had linking issues on
Windows (I opened a JIRA for this) and I didn't want to spend the time
investigating. I think it might be the same with StartsWith.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/5, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool
field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool
field
+
+ // Legacy 2-level necoding
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST)}));
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // inner
struct field
+}
+
+TEST_F(TestLevels, ListErrors) {
+ {
+ ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make(
+ "child_list", Repetition::REPEATED,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST));
+ EXPECT_TRUE(error.IsInvalid());
Review comment:
done.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
Review comment:
updated with a comment.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
Review comment:
Tried to rephrase. Def level 2 and def level 3 are both used to discriminate
between states of the same field, `child_list`, which is a `nullable list`.
When decoding, `def_level==2` indicates a non-null but empty list;
`def_level==3` indicates an element present in the list.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/5, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool
field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool
field
+
+ // Legacy 2-level necoding
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST)}));
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // inner
struct field
+}
+
+TEST_F(TestLevels, ListErrors) {
+ {
+ ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make(
+ "child_list", Repetition::REPEATED,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST));
+ EXPECT_TRUE(error.IsInvalid());
+ std::string expected("LIST-annotated groups must not be repeated.");
+ EXPECT_EQ(error.message().substr(0, expected.size()), expected);
+ }
Review comment:
From the spec:
> The outer-most level must be a group annotated with LIST that contains a
> single field named list. The repetition of this level must be either optional
> or required and determines whether the list is nullable.

I read that as meaning it should never be repeated (I didn't see any exceptions
under the 2-level encoding, but maybe we should clarify on parquet-dev@?).
##########
File path: cpp/src/parquet/arrow/schema.cc
##########
@@ -410,21 +410,66 @@ ::arrow::Result<std::shared_ptr<ArrowType>>
GetTypeForNode(
return storage_type;
}
-Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t
max_rep_level,
+struct LevelInfo {
+ int16_t def_level = 0;
+ int16_t rep_level = 0;
+ int16_t repeated_ancestor_def_level = 0;
+
+ /// Copies current levels to the schema field.
+ void Populate(SchemaField* out) {
+ out->definition_level = def_level;
+ out->repetition_level = rep_level;
+ out->repeated_ancestor_definition_level = repeated_ancestor_def_level;
+ }
+
+ /// Increments levels according to the cardinality of node.
+ void Increment(const Node& node) {
+ if (node.is_repeated()) {
+ IncrementRepeated();
+ return;
+ }
+ if (node.is_optional()) {
+ IncrementOptional();
+ return;
+ }
+ }
+
+ /// Incremetns level for a optional node.
+ void IncrementOptional() { def_level++; }
+
+ /// Increments levels for the repeated node. Returns
+ /// the previous ancestor_list_def_level.
+ int16_t IncrementRepeated() {
+ int16_t last_repeated_ancestor = repeated_ancestor_def_level;
+
+ // Repeated fields add both a repetition and definition level. This is used
+ // to distinguish between an empty list and a list with an item in it.
+ ++rep_level;
+ ++def_level;
+ // For levels >= current_def_level it indicates the list was
Review comment:
should be repeated_ancestor_def_level.
##########
File path: cpp/src/parquet/arrow/schema.cc
##########
@@ -554,41 +605,36 @@ Status GroupToSchemaField(const GroupNode& node, int16_t
current_def_level,
// repeated group $NAME {
// r/o TYPE[0] f0
// r/o TYPE[1] f1
- // }
Review comment:
Overzealous delete, I think. Added back.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]