Repository: arrow Updated Branches: refs/heads/master 0a8979d3a -> d3cb6b47f
ARROW-22: [C++] Convert flat Parquet schemas to Arrow schemas I'm going to limit the number of nested-data cases (especially repeated fields) handled in this patch, as I haven't yet thought through the nested data reassembly from repetition / definition levels. Since the effective Arrow schemas may "collapse" multiple levels of nesting (for example: 3-level array encoding -- see https://github.com/apache/parquet-cpp/blob/master/src/parquet/schema/types.h), we'll need to track the logical correspondence between repetition and definition levels so that the right null bits can be set easily during reassembly. Closes #37. Closes #38. Closes #39 Author: Wes McKinney <w...@apache.org> Author: Uwe L. Korn <uw...@xhochy.com> Closes #41 from wesm/ARROW-22 and squashes the following commits: f388210 [Wes McKinney] Correct typo in Layout.md (thanks @takahirox) e5c429a [Wes McKinney] Test for some unsupported Parquet schema types, add unannotated FIXED_LEN_BYTE_ARRAY to List<UInt8> 54daa9b [Wes McKinney] Refactor tests to invoke FromParquetSchema 74d6bae [Wes McKinney] Convert BYTE_ARRAY to StringType or List<UInt8> depending on the logical type b7b9ca9 [Uwe L. Korn] Add basic conversion for primitive types 0e2a7f1 [Uwe L. Korn] Add macro for adding dependencies to tests 0dd1109 [Uwe L. 
Korn] ARROW-78: Add constructor for DecimalType Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/d3cb6b47 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/d3cb6b47 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/d3cb6b47 Branch: refs/heads/master Commit: d3cb6b47fde2935522b73c7150d83e364f4e19c9 Parents: 0a8979d Author: Wes McKinney <w...@apache.org> Authored: Sat Mar 26 17:07:40 2016 -0700 Committer: Wes McKinney <w...@apache.org> Committed: Sat Mar 26 17:07:40 2016 -0700 ---------------------------------------------------------------------- cpp/CMakeLists.txt | 11 ++ cpp/src/arrow/parquet/CMakeLists.txt | 8 +- cpp/src/arrow/parquet/parquet-schema-test.cc | 147 ++++++++++++++++++ cpp/src/arrow/parquet/schema.cc | 178 ++++++++++++++++++++++ cpp/src/arrow/parquet/schema.h | 44 ++++++ cpp/src/arrow/types/decimal.cc | 32 ++++ cpp/src/arrow/types/decimal.h | 11 ++ cpp/src/arrow/util/status.h | 1 + format/Layout.md | 2 +- 9 files changed, 432 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/d3cb6b47/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6d70107..6ed2768 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -378,6 +378,16 @@ function(ADD_ARROW_TEST_DEPENDENCIES REL_TEST_NAME) add_dependencies(${TEST_NAME} ${ARGN}) endfunction() +# A wrapper for target_link_libraries() that is compatible with NO_TESTS. 
+function(ARROW_TEST_LINK_LIBRARIES REL_TEST_NAME) + if(NO_TESTS) + return() + endif() + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + target_link_libraries(${TEST_NAME} ${ARGN}) +endfunction() + enable_testing() ############################################################ @@ -528,6 +538,7 @@ set(ARROW_SRCS src/arrow/ipc/metadata-internal.cc src/arrow/types/construct.cc + src/arrow/types/decimal.cc src/arrow/types/json.cc src/arrow/types/list.cc src/arrow/types/primitive.cc http://git-wip-us.apache.org/repos/asf/arrow/blob/d3cb6b47/cpp/src/arrow/parquet/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index 7b449af..0d5cf26 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -19,17 +19,23 @@ # arrow_parquet : Arrow <-> Parquet adapter set(PARQUET_SRCS + schema.cc ) set(PARQUET_LIBS + arrow + ${PARQUET_SHARED_LIB} ) -add_library(arrow_parquet STATIC +add_library(arrow_parquet SHARED ${PARQUET_SRCS} ) target_link_libraries(arrow_parquet ${PARQUET_LIBS}) SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) +ADD_ARROW_TEST(parquet-schema-test) +ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) + # Headers: top level install(FILES DESTINATION include/arrow/parquet) http://git-wip-us.apache.org/repos/asf/arrow/blob/d3cb6b47/cpp/src/arrow/parquet/parquet-schema-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc new file mode 100644 index 0000000..9c3093d --- /dev/null +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <memory> +#include <vector> + +#include "gtest/gtest.h" + +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/util/status.h" + +#include "arrow/parquet/schema.h" + +namespace arrow { + +namespace parquet { + +using parquet_cpp::Repetition; +using parquet_cpp::schema::NodePtr; +using parquet_cpp::schema::GroupNode; +using parquet_cpp::schema::PrimitiveNode; + +const auto BOOL = std::make_shared<BooleanType>(); +const auto UINT8 = std::make_shared<UInt8Type>(); +const auto INT32 = std::make_shared<Int32Type>(); +const auto INT64 = std::make_shared<Int64Type>(); +const auto FLOAT = std::make_shared<FloatType>(); +const auto DOUBLE = std::make_shared<DoubleType>(); +const auto UTF8 = std::make_shared<StringType>(); +const auto BINARY = std::make_shared<ListType>( + std::make_shared<Field>("", UINT8)); + +class TestConvertParquetSchema : public ::testing::Test { + public: + virtual void SetUp() {} + + void CheckFlatSchema(const std::shared_ptr<Schema>& expected_schema) { + ASSERT_EQ(expected_schema->num_fields(), result_schema_->num_fields()); + for (int i = 0; i < expected_schema->num_fields(); ++i) { + auto lhs = result_schema_->field(i); + auto rhs = expected_schema->field(i); + EXPECT_TRUE(lhs->Equals(rhs)) + << i << " " << lhs->ToString() << " 
!= " << rhs->ToString(); + } + } + + Status ConvertSchema(const std::vector<NodePtr>& nodes) { + NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); + descr_.Init(schema); + return FromParquetSchema(&descr_, &result_schema_); + } + + protected: + parquet_cpp::SchemaDescriptor descr_; + std::shared_ptr<Schema> result_schema_; +}; + +TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { + std::vector<NodePtr> parquet_fields; + std::vector<std::shared_ptr<Field>> arrow_fields; + + parquet_fields.push_back( + PrimitiveNode::Make("boolean", Repetition::REQUIRED, parquet_cpp::Type::BOOLEAN)); + arrow_fields.push_back(std::make_shared<Field>("boolean", BOOL, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32)); + arrow_fields.push_back(std::make_shared<Field>("int32", INT32, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64)); + arrow_fields.push_back(std::make_shared<Field>("int64", INT64, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("float", Repetition::OPTIONAL, parquet_cpp::Type::FLOAT)); + arrow_fields.push_back(std::make_shared<Field>("float", FLOAT)); + + parquet_fields.push_back( + PrimitiveNode::Make("double", Repetition::OPTIONAL, parquet_cpp::Type::DOUBLE)); + arrow_fields.push_back(std::make_shared<Field>("double", DOUBLE)); + + parquet_fields.push_back( + PrimitiveNode::Make("binary", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY)); + arrow_fields.push_back(std::make_shared<Field>("binary", BINARY)); + + parquet_fields.push_back( + PrimitiveNode::Make("string", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY, + parquet_cpp::LogicalType::UTF8)); + arrow_fields.push_back(std::make_shared<Field>("string", UTF8)); + + parquet_fields.push_back( + PrimitiveNode::Make("flba-binary", Repetition::OPTIONAL, + parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY, + parquet_cpp::LogicalType::NONE, 
12)); + arrow_fields.push_back(std::make_shared<Field>("flba-binary", BINARY)); + + auto arrow_schema = std::make_shared<Schema>(arrow_fields); + ASSERT_OK(ConvertSchema(parquet_fields)); + + CheckFlatSchema(arrow_schema); +} + +TEST_F(TestConvertParquetSchema, UnsupportedThings) { + std::vector<NodePtr> unsupported_nodes; + + unsupported_nodes.push_back( + PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96)); + + unsupported_nodes.push_back( + GroupNode::Make("repeated-group", Repetition::REPEATED, {})); + + unsupported_nodes.push_back( + PrimitiveNode::Make("int32", Repetition::OPTIONAL, + parquet_cpp::Type::INT32, parquet_cpp::LogicalType::DATE)); + + unsupported_nodes.push_back( + PrimitiveNode::Make("int64", Repetition::OPTIONAL, + parquet_cpp::Type::INT64, parquet_cpp::LogicalType::TIMESTAMP_MILLIS)); + + for (const NodePtr& node : unsupported_nodes) { + ASSERT_RAISES(NotImplemented, ConvertSchema({node})); + } +} + +TEST(TestNodeConversion, DateAndTime) { +} + +} // namespace parquet + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/d3cb6b47/cpp/src/arrow/parquet/schema.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc new file mode 100644 index 0000000..6b1de57 --- /dev/null +++ b/cpp/src/arrow/parquet/schema.cc @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/parquet/schema.h" + +#include <vector> + +#include "parquet/api/schema.h" + +#include "arrow/util/status.h" +#include "arrow/types/decimal.h" + +using parquet_cpp::schema::Node; +using parquet_cpp::schema::NodePtr; +using parquet_cpp::schema::GroupNode; +using parquet_cpp::schema::PrimitiveNode; + +using parquet_cpp::LogicalType; + +namespace arrow { + +namespace parquet { + +const auto BOOL = std::make_shared<BooleanType>(); +const auto UINT8 = std::make_shared<UInt8Type>(); +const auto INT32 = std::make_shared<Int32Type>(); +const auto INT64 = std::make_shared<Int64Type>(); +const auto FLOAT = std::make_shared<FloatType>(); +const auto DOUBLE = std::make_shared<DoubleType>(); +const auto UTF8 = std::make_shared<StringType>(); +const auto BINARY = std::make_shared<ListType>( + std::make_shared<Field>("", UINT8)); + +TypePtr MakeDecimalType(const PrimitiveNode* node) { + int precision = node->decimal_metadata().precision; + int scale = node->decimal_metadata().scale; + return std::make_shared<DecimalType>(precision, scale); +} + +static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::UTF8: + *out = UTF8; + break; + default: + // BINARY + *out = BINARY; + break; + } + return Status::OK(); +} + +static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = BINARY; + break; + case LogicalType::DECIMAL: + *out = MakeDecimalType(node); + break; + default: + return 
Status::NotImplemented("unhandled type"); + break; + } + + return Status::OK(); +} + +static Status FromInt32(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = INT32; + break; + default: + return Status::NotImplemented("Unhandled logical type for int32"); + break; + } + return Status::OK(); +} + +static Status FromInt64(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = INT64; + break; + default: + return Status::NotImplemented("Unhandled logical type for int64"); + break; + } + return Status::OK(); +} + +// TODO: Logical Type Handling +Status NodeToField(const NodePtr& node, std::shared_ptr<Field>* out) { + std::shared_ptr<DataType> type; + + if (node->is_repeated()) { + return Status::NotImplemented("No support yet for repeated node types"); + } + + if (node->is_group()) { + const GroupNode* group = static_cast<const GroupNode*>(node.get()); + std::vector<std::shared_ptr<Field>> fields(group->field_count()); + for (int i = 0; i < group->field_count(); i++) { + RETURN_NOT_OK(NodeToField(group->field(i), &fields[i])); + } + type = std::make_shared<StructType>(fields); + } else { + // Primitive (leaf) node + const PrimitiveNode* primitive = static_cast<const PrimitiveNode*>(node.get()); + + switch (primitive->physical_type()) { + case parquet_cpp::Type::BOOLEAN: + type = BOOL; + break; + case parquet_cpp::Type::INT32: + RETURN_NOT_OK(FromInt32(primitive, &type)); + break; + case parquet_cpp::Type::INT64: + RETURN_NOT_OK(FromInt64(primitive, &type)); + break; + case parquet_cpp::Type::INT96: + // TODO: Do we have that type in Arrow? + // type = TypePtr(new Int96Type()); + return Status::NotImplemented("int96"); + case parquet_cpp::Type::FLOAT: + type = FLOAT; + break; + case parquet_cpp::Type::DOUBLE: + type = DOUBLE; + break; + case parquet_cpp::Type::BYTE_ARRAY: + // TODO: Do we have that type in Arrow? 
+ RETURN_NOT_OK(FromByteArray(primitive, &type)); + break; + case parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY: + RETURN_NOT_OK(FromFLBA(primitive, &type)); + break; + } + } + + *out = std::make_shared<Field>(node->name(), type, !node->is_required()); + return Status::OK(); +} + +Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, + std::shared_ptr<Schema>* out) { + // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes + // from the root Parquet node + const GroupNode* schema_node = static_cast<const GroupNode*>( + parquet_schema->schema().get()); + + std::vector<std::shared_ptr<Field>> fields(schema_node->field_count()); + for (int i = 0; i < schema_node->field_count(); i++) { + RETURN_NOT_OK(NodeToField(schema_node->field(i), &fields[i])); + } + + *out = std::make_shared<Schema>(fields); + return Status::OK(); +} + +} // namespace parquet + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/d3cb6b47/cpp/src/arrow/parquet/schema.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h new file mode 100644 index 0000000..61de193 --- /dev/null +++ b/cpp/src/arrow/parquet/schema.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PARQUET_SCHEMA_H +#define ARROW_PARQUET_SCHEMA_H + +#include <memory> + +#include "parquet/api/schema.h" + +#include "arrow/schema.h" +#include "arrow/type.h" + +namespace arrow { + +class Status; + +namespace parquet { + +Status NodeToField(const parquet_cpp::schema::NodePtr& node, + std::shared_ptr<Field>* out); + +Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, + std::shared_ptr<Schema>* out); + +} // namespace parquet + +} // namespace arrow + +#endif http://git-wip-us.apache.org/repos/asf/arrow/blob/d3cb6b47/cpp/src/arrow/types/decimal.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/decimal.cc b/cpp/src/arrow/types/decimal.cc new file mode 100644 index 0000000..f120c1a --- /dev/null +++ b/cpp/src/arrow/types/decimal.cc @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/types/decimal.h" + +#include <sstream> +#include <string> + +namespace arrow { + +std::string DecimalType::ToString() const { + std::stringstream s; + s << "decimal(" << precision << ", " << scale << ")"; + return s.str(); +} + +} // namespace arrow + http://git-wip-us.apache.org/repos/asf/arrow/blob/d3cb6b47/cpp/src/arrow/types/decimal.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 464c3ff..26243b4 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -18,13 +18,24 @@ #ifndef ARROW_TYPES_DECIMAL_H #define ARROW_TYPES_DECIMAL_H +#include <string> + #include "arrow/type.h" namespace arrow { struct DecimalType : public DataType { + explicit DecimalType(int precision_, int scale_) + : DataType(Type::DECIMAL), precision(precision_), + scale(scale_) { } int precision; int scale; + + static char const *name() { + return "decimal"; + } + + std::string ToString() const override; }; } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/d3cb6b47/cpp/src/arrow/util/status.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index b593123..4e273ed 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -109,6 +109,7 @@ class Status { bool IsKeyError() const { return code() == StatusCode::KeyError; } bool IsInvalid() const { return code() == StatusCode::Invalid; } bool IsIOError() const { return code() == StatusCode::IOError; } + bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; } // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. 
http://git-wip-us.apache.org/repos/asf/arrow/blob/d3cb6b47/format/Layout.md ---------------------------------------------------------------------- diff --git a/format/Layout.md b/format/Layout.md index 2d46ece..1b532c6 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -58,7 +58,7 @@ Base requirements * Memory layout and random access patterns for each relative type * Null value representation -## Non-goals (for this document +## Non-goals (for this document) * To enumerate or specify logical types that can be implemented as primitive (fixed-width) value types. For example: signed and unsigned integers,