Repository: parquet-cpp Updated Branches: refs/heads/master 378f335c1 -> 52d36960e
PARQUET-809: Add SchemaDescriptor::Equals method To make it simpler to compare file metadata Author: Wes McKinney <[email protected]> Closes #214 from wesm/PARQUET-809 and squashes the following commits: 691e5bc [Wes McKinney] Add SchemaDescriptor::Equals method Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/52d36960 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/52d36960 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/52d36960 Branch: refs/heads/master Commit: 52d36960ef46a497089bd35b73eada8a689ab6d9 Parents: 378f335 Author: Wes McKinney <[email protected]> Authored: Thu Jan 5 12:21:05 2017 -0500 Committer: Wes McKinney <[email protected]> Committed: Thu Jan 5 12:21:05 2017 -0500 ---------------------------------------------------------------------- src/parquet/schema/descriptor.cc | 20 +++++++++ src/parquet/schema/descriptor.h | 4 ++ src/parquet/schema/schema-descriptor-test.cc | 54 +++++++++++++++++++++++ 3 files changed, 78 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/52d36960/src/parquet/schema/descriptor.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc index 4d46204..c5250d1 100644 --- a/src/parquet/schema/descriptor.cc +++ b/src/parquet/schema/descriptor.cc @@ -47,6 +47,20 @@ void SchemaDescriptor::Init(const NodePtr& schema) { } } +bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const { + if (this->num_columns() != other.num_columns()) { + return false; + } + + for (int i = 0; i < this->num_columns(); ++i) { + if (!this->Column(i)->Equals(*other.Column(i))) { + return false; + } + } + + return true; +} + void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level, int16_t max_rep_level, const NodePtr& base) { if (node->is_optional()) { @@ -82,6 +96,12 @@ ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node, primitive_node_ = static_cast<const PrimitiveNode*>(node_.get()); } +bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const { + return primitive_node_->Equals(other.primitive_node_) && + max_repetition_level() == other.max_repetition_level() && + max_definition_level() == other.max_definition_level(); +} + const ColumnDescriptor* SchemaDescriptor::Column(int i) const { DCHECK(i >= 0 && i < static_cast<int>(leaves_.size())); return &leaves_[i]; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/52d36960/src/parquet/schema/descriptor.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h index 1673d5f..ae7b60e 100644 --- a/src/parquet/schema/descriptor.h +++ b/src/parquet/schema/descriptor.h @@ -42,6 +42,8 @@ class PARQUET_EXPORT ColumnDescriptor { ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level, int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr); + bool Equals(const ColumnDescriptor& other) const; + int16_t max_definition_level() const { return max_definition_level_; } int16_t max_repetition_level() const { return max_repetition_level_; } @@ -97,6 +99,8 @@ class PARQUET_EXPORT SchemaDescriptor { const ColumnDescriptor* Column(int i) const; + bool Equals(const SchemaDescriptor& other) const; + // The number of physical columns appearing in the file int num_columns() const { return leaves_.size(); } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/52d36960/src/parquet/schema/schema-descriptor-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc index eeaec5b..467d63c 100644 --- a/src/parquet/schema/schema-descriptor-test.cc +++ b/src/parquet/schema/schema-descriptor-test.cc @@ -71,6 +71,60 @@ TEST_F(TestSchemaDescriptor, InitNonGroup) { ASSERT_THROW(descr_.Init(node), ParquetException); } +TEST_F(TestSchemaDescriptor, Equals) { + NodePtr schema; + + NodePtr inta = Int32("a", Repetition::REQUIRED); + NodePtr intb = Int64("b", Repetition::OPTIONAL); + NodePtr intb2 = Int64("b2", Repetition::OPTIONAL); + NodePtr intc = ByteArray("c", Repetition::REPEATED); + + NodePtr item1 = Int64("item1", Repetition::REQUIRED); + NodePtr item2 = Boolean("item2", Repetition::OPTIONAL); + NodePtr item3 = Int32("item3", Repetition::REPEATED); + NodePtr list(GroupNode::Make( + "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST)); + + NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); + NodePtr bag2(GroupNode::Make("bag", Repetition::REQUIRED, {list})); + + SchemaDescriptor descr1; + descr1.Init(GroupNode::Make("schema", Repetition::REPEATED, + {inta, intb, intc, bag})); + + ASSERT_TRUE(descr1.Equals(descr1)); + + SchemaDescriptor descr2; + descr2.Init(GroupNode::Make("schema", Repetition::REPEATED, + {inta, intb, intc, bag2})); + ASSERT_FALSE(descr1.Equals(descr2)); + + SchemaDescriptor descr3; + descr3.Init(GroupNode::Make("schema", Repetition::REPEATED, + {inta, intb2, intc, bag})); + ASSERT_FALSE(descr1.Equals(descr3)); + + // Robust to name of parent node + SchemaDescriptor descr4; + descr4.Init(GroupNode::Make("SCHEMA", Repetition::REPEATED, + {inta, intb, intc, bag})); + ASSERT_TRUE(descr1.Equals(descr4)); + + SchemaDescriptor descr5; + descr5.Init(GroupNode::Make("schema", Repetition::REPEATED, + {inta, intb, intc, bag, intb2})); + ASSERT_FALSE(descr1.Equals(descr5)); + + // Different max repetition / definition levels + ColumnDescriptor col1(inta, 5, 1); + ColumnDescriptor col2(inta, 6, 1); + ColumnDescriptor col3(inta, 5, 2); + + ASSERT_TRUE(col1.Equals(col1)); + ASSERT_FALSE(col1.Equals(col2)); + ASSERT_FALSE(col1.Equals(col3)); +} + TEST_F(TestSchemaDescriptor, BuildTree) { NodeVector fields; NodePtr schema;
