Repository: parquet-cpp Updated Branches: refs/heads/master 21ad2c397 -> b89cbad30
PARQUET-918: Keep ordering in column indices when converting Parquet Schema This is a follow-up fix for [PARQUET-918](https://github.com/apache/parquet-cpp/pull/295). Do I need to create another JIRA for this? It looks like some .idea files were included by accident. They appear to be harmless. Do I need to revert them? @wesm cc @wesm @itaiin for reviewing Author: Xianjin YE <[email protected]> Closes #297 from advancedxy/master and squashes the following commits: e606d9d [Xianjin YE] Add .idea/ to .gitignore and make style check happy. 1adb192 [Xianjin YE] Add API doc for FromParquetSchema(parquet_schema, column_indices, out) 8de263b [Xianjin YE] Keep ordering in column indices when converting Parquet Schema to Arrow Schema Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/b89cbad3 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/b89cbad3 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/b89cbad3 Branch: refs/heads/master Commit: b89cbad30b699ec0b2cb23271f898ca89670f192 Parents: 21ad2c3 Author: Xianjin YE <[email protected]> Authored: Fri Apr 14 15:46:30 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Fri Apr 14 15:46:30 2017 -0400 ---------------------------------------------------------------------- .gitignore | 1 + src/parquet/arrow/arrow-schema-test.cc | 52 +++++++++++++++++++++++++++-- src/parquet/arrow/schema.cc | 13 +++++--- src/parquet/arrow/schema.h | 7 ++++ 4 files changed, 67 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b89cbad3/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index aeb80e1..9de56ea 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ Makefile thirdparty *.pc +.idea/ \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b89cbad3/src/parquet/arrow/arrow-schema-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/arrow-schema-test.cc b/src/parquet/arrow/arrow-schema-test.cc index 85578ac..0f6b455 100644 --- a/src/parquet/arrow/arrow-schema-test.cc +++ b/src/parquet/arrow/arrow-schema-test.cc @@ -62,8 +62,8 @@ class TestConvertParquetSchema : public ::testing::Test { for (int i = 0; i < expected_schema->num_fields(); ++i) { auto lhs = result_schema_->field(i); auto rhs = expected_schema->field(i); - EXPECT_TRUE(lhs->Equals(rhs)) << i << " " << lhs->ToString() - << " != " << rhs->ToString(); + EXPECT_TRUE(lhs->Equals(rhs)) + << i << " " << lhs->ToString() << " != " << rhs->ToString(); } } @@ -433,6 +433,54 @@ TEST_F(TestConvertParquetSchema, ParquetNestedSchemaPartial) { CheckFlatSchema(arrow_schema); } +TEST_F(TestConvertParquetSchema, ParquetNestedSchemaPartialOrdering) { + std::vector<NodePtr> parquet_fields; + std::vector<std::shared_ptr<Field>> arrow_fields; + + // Full Parquet Schema: + // required group group1 { + // required int64 leaf1; + // required int64 leaf2; + // } + // required group group2 { + // required int64 leaf3; + // required int64 leaf4; + // } + // required int64 leaf5; + // + // Expected partial arrow schema (columns 3, 4, 0): + // required group group2 { + // required int64 leaf4; + // } + // required int64 leaf5; + // required group group1 { + // required int64 leaf1; + // } + { + parquet_fields.push_back(GroupNode::Make("group1", Repetition::REQUIRED, + {PrimitiveNode::Make("leaf1", Repetition::REQUIRED, ParquetType::INT64), + PrimitiveNode::Make("leaf2", Repetition::REQUIRED, ParquetType::INT64)})); + parquet_fields.push_back(GroupNode::Make("group2", Repetition::REQUIRED, + {PrimitiveNode::Make("leaf3", Repetition::REQUIRED, ParquetType::INT64), + PrimitiveNode::Make("leaf4", Repetition::REQUIRED, ParquetType::INT64)})); + 
parquet_fields.push_back( + PrimitiveNode::Make("leaf5", Repetition::REQUIRED, ParquetType::INT64)); + + auto group1_fields = {std::make_shared<Field>("leaf1", INT64, false)}; + auto arrow_group1_type = std::make_shared<::arrow::StructType>(group1_fields); + auto group2_fields = {std::make_shared<Field>("leaf4", INT64, false)}; + auto arrow_group2_type = std::make_shared<::arrow::StructType>(group2_fields); + + arrow_fields.push_back(std::make_shared<Field>("group2", arrow_group2_type, false)); + arrow_fields.push_back(std::make_shared<Field>("leaf5", INT64, false)); + arrow_fields.push_back(std::make_shared<Field>("group1", arrow_group1_type, false)); + } + + auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields); + ASSERT_OK(ConvertSchema(parquet_fields, {3, 4, 0})); + + CheckFlatSchema(arrow_schema); +} TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) { std::vector<NodePtr> parquet_fields; std::vector<std::shared_ptr<Field>> arrow_fields; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b89cbad3/src/parquet/arrow/schema.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/schema.cc b/src/parquet/arrow/schema.cc index 2c74839..25713a7 100644 --- a/src/parquet/arrow/schema.cc +++ b/src/parquet/arrow/schema.cc @@ -330,21 +330,26 @@ Status FromParquetSchema(const SchemaDescriptor* parquet_schema, const std::vector<int>& column_indices, std::shared_ptr<::arrow::Schema>* out) { // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes // from the root Parquet node - const GroupNode* schema_node = parquet_schema->group_node(); // Put the right leaf nodes in an unordered set + // Index in column_indices should be unique, duplicate indices are merged into one and + // ordering by its first appearing. 
int num_columns = static_cast<int>(column_indices.size()); + std::unordered_set<NodePtr> top_nodes; // to deduplicate the top nodes + std::vector<NodePtr> base_nodes; // to keep the ordering std::unordered_set<NodePtr> included_leaf_nodes(num_columns); for (int i = 0; i < num_columns; i++) { auto column_desc = parquet_schema->Column(column_indices[i]); included_leaf_nodes.insert(column_desc->schema_node()); + auto column_root = parquet_schema->GetColumnRoot(column_indices[i]); + auto insertion = top_nodes.insert(column_root); + if (insertion.second) { base_nodes.push_back(column_root); } } std::vector<std::shared_ptr<Field>> fields; std::shared_ptr<Field> field; - for (int i = 0; i < schema_node->field_count(); i++) { - RETURN_NOT_OK( - NodeToFieldInternal(schema_node->field(i), &included_leaf_nodes, &field)); + for (auto node : base_nodes) { + RETURN_NOT_OK(NodeToFieldInternal(node, &included_leaf_nodes, &field)); if (field != nullptr) { fields.push_back(field); } } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b89cbad3/src/parquet/arrow/schema.h ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/schema.h b/src/parquet/arrow/schema.h index b93f088..1866fea 100644 --- a/src/parquet/arrow/schema.h +++ b/src/parquet/arrow/schema.h @@ -39,6 +39,13 @@ namespace arrow { ::arrow::Status PARQUET_EXPORT NodeToField( const schema::NodePtr& node, std::shared_ptr<::arrow::Field>* out); +/// Convert parquet schema to arrow schema with selected indices +/// \param parquet_schema to be converted +/// \param column_indices indices of leaf nodes in parquet schema tree. Appearing ordering +/// matters for the converted schema. Repeated indices are ignored +/// except for the first one +/// \param out the corresponding arrow schema +/// \return Status::OK() on a successful conversion. 
::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, const std::vector<int>& column_indices, std::shared_ptr<::arrow::Schema>* out);
