Repository: parquet-cpp Updated Branches: refs/heads/master 257e65b81 -> 13da51d3f
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-types-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc deleted file mode 100644 index 37c8b14..0000000 --- a/src/parquet/schema/schema-types-test.cc +++ /dev/null @@ -1,311 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include <gtest/gtest.h> - -#include <memory> -#include <string> -#include <vector> - -#include "parquet/exception.h" -#include "parquet/schema/test-util.h" -#include "parquet/schema/types.h" -#include "parquet/thrift/parquet_types.h" -#include "parquet/types.h" - -using std::string; -using std::vector; - -namespace parquet { - -namespace schema { - -// ---------------------------------------------------------------------- -// ColumnPath - -TEST(TestColumnPath, TestAttrs) { - ColumnPath path(std::vector<std::string>({"toplevel", "leaf"})); - - ASSERT_EQ(path.ToDotString(), "toplevel.leaf"); - - std::shared_ptr<ColumnPath> path_ptr = ColumnPath::FromDotString("toplevel.leaf"); - ASSERT_EQ(path_ptr->ToDotString(), "toplevel.leaf"); - - std::shared_ptr<ColumnPath> extended = path_ptr->extend("anotherlevel"); - ASSERT_EQ(extended->ToDotString(), "toplevel.leaf.anotherlevel"); -} - -// ---------------------------------------------------------------------- -// Primitive node - -class TestPrimitiveNode : public ::testing::Test { - public: - void SetUp() { - name_ = "name"; - id_ = 5; - } - - void Convert(const format::SchemaElement* element) { - node_ = PrimitiveNode::FromParquet(element, id_); - ASSERT_TRUE(node_->is_primitive()); - prim_node_ = static_cast<const PrimitiveNode*>(node_.get()); - } - - protected: - std::string name_; - const PrimitiveNode* prim_node_; - - int id_; - std::unique_ptr<Node> node_; -}; - -TEST_F(TestPrimitiveNode, Attrs) { - PrimitiveNode node1("foo", Repetition::REPEATED, Type::INT32); - - PrimitiveNode node2("bar", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8); - - ASSERT_EQ("foo", node1.name()); - - ASSERT_TRUE(node1.is_primitive()); - ASSERT_FALSE(node1.is_group()); - - ASSERT_EQ(Repetition::REPEATED, node1.repetition()); - ASSERT_EQ(Repetition::OPTIONAL, node2.repetition()); - - ASSERT_EQ(Node::PRIMITIVE, node1.node_type()); - - ASSERT_EQ(Type::INT32, node1.physical_type()); - ASSERT_EQ(Type::BYTE_ARRAY, node2.physical_type()); - - // logical types - ASSERT_EQ(LogicalType::NONE, node1.logical_type()); - ASSERT_EQ(LogicalType::UTF8, node2.logical_type()); - - // repetition - node1 = PrimitiveNode("foo", Repetition::REQUIRED, Type::INT32); - node2 = PrimitiveNode("foo", Repetition::OPTIONAL, Type::INT32); - PrimitiveNode node3("foo", Repetition::REPEATED, Type::INT32); - - ASSERT_TRUE(node1.is_required()); - - ASSERT_TRUE(node2.is_optional()); - ASSERT_FALSE(node2.is_required()); - - ASSERT_TRUE(node3.is_repeated()); - ASSERT_FALSE(node3.is_optional()); -} - -TEST_F(TestPrimitiveNode, FromParquet) { - SchemaElement elt = - NewPrimitive(name_, FieldRepetitionType::OPTIONAL, format::Type::INT32, 0); - Convert(&elt); - ASSERT_EQ(name_, prim_node_->name()); - ASSERT_EQ(id_, prim_node_->id()); - ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition()); - ASSERT_EQ(Type::INT32, prim_node_->physical_type()); - ASSERT_EQ(LogicalType::NONE, prim_node_->logical_type()); - - // Test a logical type - elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, format::Type::BYTE_ARRAY, 0); - elt.__set_converted_type(ConvertedType::UTF8); - - Convert(&elt); - ASSERT_EQ(Repetition::REQUIRED, prim_node_->repetition()); - ASSERT_EQ(Type::BYTE_ARRAY, prim_node_->physical_type()); - ASSERT_EQ(LogicalType::UTF8, prim_node_->logical_type()); - - // FIXED_LEN_BYTE_ARRAY - elt = NewPrimitive( - name_, FieldRepetitionType::OPTIONAL, format::Type::FIXED_LEN_BYTE_ARRAY, 0); - elt.__set_type_length(16); - - Convert(&elt); - ASSERT_EQ(name_, prim_node_->name()); - ASSERT_EQ(id_, prim_node_->id()); - ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition()); - ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type()); - ASSERT_EQ(16, prim_node_->type_length()); - - // ConvertedType::Decimal - elt = NewPrimitive( - name_, FieldRepetitionType::OPTIONAL, format::Type::FIXED_LEN_BYTE_ARRAY, 0); - elt.__set_converted_type(ConvertedType::DECIMAL); - elt.__set_type_length(6); - elt.__set_scale(2); - elt.__set_precision(12); - - Convert(&elt); - ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type()); - ASSERT_EQ(LogicalType::DECIMAL, prim_node_->logical_type()); - ASSERT_EQ(6, prim_node_->type_length()); - ASSERT_EQ(2, prim_node_->decimal_metadata().scale); - ASSERT_EQ(12, prim_node_->decimal_metadata().precision); -} - -TEST_F(TestPrimitiveNode, Equals) { - PrimitiveNode node1("foo", Repetition::REQUIRED, Type::INT32); - PrimitiveNode node2("foo", Repetition::REQUIRED, Type::INT64); - PrimitiveNode node3("bar", Repetition::REQUIRED, Type::INT32); - PrimitiveNode node4("foo", Repetition::OPTIONAL, Type::INT32); - PrimitiveNode node5("foo", Repetition::REQUIRED, Type::INT32); - - ASSERT_TRUE(node1.Equals(&node1)); - ASSERT_FALSE(node1.Equals(&node2)); - ASSERT_FALSE(node1.Equals(&node3)); - ASSERT_FALSE(node1.Equals(&node4)); - ASSERT_TRUE(node1.Equals(&node5)); - - PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::DECIMAL, 12, 4, 2); - - PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::DECIMAL, 1, 4, 2); - flba2.SetTypeLength(12); - - PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::DECIMAL, 1, 4, 2); - flba3.SetTypeLength(16); - - PrimitiveNode flba4("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::DECIMAL, 12, 4, 0); - - PrimitiveNode flba5("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::NONE, 12, 4, 0); - - ASSERT_TRUE(flba1.Equals(&flba2)); - ASSERT_FALSE(flba1.Equals(&flba3)); - ASSERT_FALSE(flba1.Equals(&flba4)); - ASSERT_FALSE(flba1.Equals(&flba5)); -} - -TEST_F(TestPrimitiveNode, PhysicalLogicalMapping) { - ASSERT_NO_THROW( - PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::INT32, LogicalType::INT_32)); - ASSERT_NO_THROW(PrimitiveNode::Make( - "foo", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::JSON)); - ASSERT_THROW( - PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::INT32, LogicalType::JSON), - ParquetException); - ASSERT_NO_THROW(PrimitiveNode::Make( - "foo", Repetition::REQUIRED, Type::INT64, LogicalType::TIMESTAMP_MILLIS)); - ASSERT_THROW( - PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::INT32, LogicalType::INT_64), - ParquetException); - ASSERT_THROW(PrimitiveNode::Make( - "foo", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::INT_8), - ParquetException); - ASSERT_THROW(PrimitiveNode::Make( - "foo", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::INTERVAL), - ParquetException); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::ENUM), - ParquetException); - ASSERT_NO_THROW(PrimitiveNode::Make( - "foo", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::ENUM)); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 2, 4), - ParquetException); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, Type::FLOAT, - LogicalType::DECIMAL, 0, 2, 4), - ParquetException); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 4, 0), - ParquetException); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 0, 4), - ParquetException); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 4, -1), - ParquetException); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 2, 4), - ParquetException); - ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 6, 4)); - ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 12)); - ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 10), - ParquetException); -} - -// ---------------------------------------------------------------------- -// Group node - -class TestGroupNode : public ::testing::Test { - public: - NodeVector Fields1() { - NodeVector fields; - - fields.push_back(Int32("one", Repetition::REQUIRED)); - fields.push_back(Int64("two")); - fields.push_back(Double("three")); - - return fields; - } -}; - -TEST_F(TestGroupNode, Attrs) { - NodeVector fields = Fields1(); - - GroupNode node1("foo", Repetition::REPEATED, fields); - GroupNode node2("bar", Repetition::OPTIONAL, fields, LogicalType::LIST); - - ASSERT_EQ("foo", node1.name()); - - ASSERT_TRUE(node1.is_group()); - ASSERT_FALSE(node1.is_primitive()); - - ASSERT_EQ(fields.size(), node1.field_count()); - - ASSERT_TRUE(node1.is_repeated()); - ASSERT_TRUE(node2.is_optional()); - - ASSERT_EQ(Repetition::REPEATED, node1.repetition()); - ASSERT_EQ(Repetition::OPTIONAL, node2.repetition()); - - ASSERT_EQ(Node::GROUP, node1.node_type()); - - // logical types - ASSERT_EQ(LogicalType::NONE, node1.logical_type()); - ASSERT_EQ(LogicalType::LIST, node2.logical_type()); -} - -TEST_F(TestGroupNode, Equals) { - NodeVector f1 = Fields1(); - NodeVector f2 = Fields1(); - - GroupNode group1("group", Repetition::REPEATED, f1); - GroupNode group2("group", Repetition::REPEATED, f2); - GroupNode group3("group2", Repetition::REPEATED, f2); - - // This is copied in the GroupNode ctor, so this is okay - f2.push_back(Float("four", Repetition::OPTIONAL)); - GroupNode group4("group", Repetition::REPEATED, f2); - GroupNode group5("group", Repetition::REPEATED, Fields1()); - - ASSERT_TRUE(group1.Equals(&group1)); - ASSERT_TRUE(group1.Equals(&group2)); - ASSERT_FALSE(group1.Equals(&group3)); - - ASSERT_FALSE(group1.Equals(&group4)); - ASSERT_FALSE(group5.Equals(&group4)); -} - -} // namespace schema - -} // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/test-util.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/test-util.h b/src/parquet/schema/test-util.h deleted file mode 100644 index 752b8f3..0000000 --- a/src/parquet/schema/test-util.h +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This module defines an abstract interface for iterating through pages in a -// Parquet column chunk within a row group. It could be extended in the future -// to iterate through all data pages in all chunks in a file. - -#ifndef PARQUET_SCHEMA_TEST_UTIL_H -#define PARQUET_SCHEMA_TEST_UTIL_H - -#include <string> - -#include "parquet/schema/types.h" -#include "parquet/thrift/parquet_types.h" - -using parquet::format::ConvertedType; -using parquet::format::FieldRepetitionType; -using parquet::format::SchemaElement; - -namespace parquet { - -namespace schema { - -static inline SchemaElement NewPrimitive(const std::string& name, - FieldRepetitionType::type repetition, format::Type::type type, int id = 0) { - SchemaElement result; - result.__set_name(name); - result.__set_repetition_type(repetition); - result.__set_type(type); - result.__set_num_children(0); - - return result; -} - -static inline SchemaElement NewGroup(const std::string& name, - FieldRepetitionType::type repetition, int num_children, int id = 0) { - SchemaElement result; - result.__set_name(name); - result.__set_repetition_type(repetition); - result.__set_num_children(num_children); - - return result; -} - -} // namespace schema - -} // namespace parquet - -#endif // PARQUET_COLUMN_TEST_UTIL_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/types.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc deleted file mode 100644 index 7d452c3..0000000 --- a/src/parquet/schema/types.cc +++ /dev/null @@ -1,315 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "parquet/schema/types.h" - -#include <algorithm> -#include <memory> - -#include "parquet/exception.h" -#include "parquet/thrift/parquet_types.h" -#include "parquet/thrift/util.h" - -namespace parquet { - -namespace schema { - -// ---------------------------------------------------------------------- -// ColumnPath - -std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) { - std::stringstream ss(dotstring); - std::string item; - std::vector<std::string> path; - while (std::getline(ss, item, '.')) { - path.push_back(item); - } - return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(path))); -} - -std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const { - std::vector<std::string> path; - path.reserve(path_.size() + 1); - path.resize(path_.size() + 1); - std::copy(path_.cbegin(), path_.cend(), path.begin()); - path[path_.size()] = node_name; - - return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(path))); -} - -std::string ColumnPath::ToDotString() const { - std::stringstream ss; - for (auto it = path_.cbegin(); it != path_.cend(); ++it) { - if (it != path_.cbegin()) { ss << "."; } - ss << *it; - } - return ss.str(); -} - -const std::vector<std::string>& ColumnPath::ToDotVector() const { - return path_; -} - -// ---------------------------------------------------------------------- -// Base node - -bool Node::EqualsInternal(const Node* other) const { - return type_ == other->type_ && name_ == other->name_ && - repetition_ == other->repetition_ && logical_type_ == other->logical_type_; -} - -void Node::SetParent(const Node* parent) { - parent_ = parent; -} - -// ---------------------------------------------------------------------- -// Primitive node - -PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition, - Type::type type, LogicalType::type logical_type, int length, int precision, int scale, - int id) - : Node(Node::PRIMITIVE, name, repetition, logical_type, id), - physical_type_(type), - type_length_(length) { - std::stringstream ss; - - // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being - // set to true, but Impala will raise an incompatible metadata in such cases - memset(&decimal_metadata_, 0, sizeof(decimal_metadata_)); - - // Check if the physical and logical types match - // Mapping referred from Apache parquet-mr as on 2016-02-22 - switch (logical_type) { - case LogicalType::NONE: - // Logical type not set - break; - case LogicalType::UTF8: - case LogicalType::JSON: - case LogicalType::BSON: - if (type != Type::BYTE_ARRAY) { - ss << LogicalTypeToString(logical_type); - ss << " can only annotate BYTE_ARRAY fields"; - throw ParquetException(ss.str()); - } - break; - case LogicalType::DECIMAL: - if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) && - (type != Type::FIXED_LEN_BYTE_ARRAY)) { - ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED"; - throw ParquetException(ss.str()); - } - if (precision <= 0) { - ss << "Invalid DECIMAL precision: " << precision; - throw ParquetException(ss.str()); - } - if (scale < 0) { - ss << "Invalid DECIMAL scale: " << scale; - throw ParquetException(ss.str()); - } - if (scale > precision) { - ss << "Invalid DECIMAL scale " << scale; - ss << " cannot be greater than precision " << precision; - throw ParquetException(ss.str()); - } - decimal_metadata_.isset = true; - decimal_metadata_.precision = precision; - decimal_metadata_.scale = scale; - break; - case LogicalType::DATE: - case LogicalType::TIME_MILLIS: - case LogicalType::UINT_8: - case LogicalType::UINT_16: - case LogicalType::UINT_32: - case LogicalType::INT_8: - case LogicalType::INT_16: - case LogicalType::INT_32: - if (type != Type::INT32) { - ss << LogicalTypeToString(logical_type); - ss << " can only annotate INT32"; - throw ParquetException(ss.str()); - } - break; - case LogicalType::TIME_MICROS: - case LogicalType::TIMESTAMP_MILLIS: - case LogicalType::TIMESTAMP_MICROS: - case LogicalType::UINT_64: - case LogicalType::INT_64: - if (type != Type::INT64) { - ss << LogicalTypeToString(logical_type); - ss << " can only annotate INT64"; - throw ParquetException(ss.str()); - } - break; - case LogicalType::INTERVAL: - if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) { - ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)"; - throw ParquetException(ss.str()); - } - break; - case LogicalType::ENUM: - if (type != Type::BYTE_ARRAY) { - ss << "ENUM can only annotate BYTE_ARRAY fields"; - throw ParquetException(ss.str()); - } - break; - default: - ss << LogicalTypeToString(logical_type); - ss << " can not be applied to a primitive type"; - throw ParquetException(ss.str()); - } - if (type == Type::FIXED_LEN_BYTE_ARRAY) { - if (length <= 0) { - ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length; - throw ParquetException(ss.str()); - } - type_length_ = length; - } -} - -bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const { - bool is_equal = true; - if ((physical_type_ != other->physical_type_) || - (logical_type_ != other->logical_type_)) { - return false; - } - if (logical_type_ == LogicalType::DECIMAL) { - is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) && - (decimal_metadata_.scale == other->decimal_metadata_.scale); - } - if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) { - is_equal &= (type_length_ == other->type_length_); - } - return is_equal; -} - -bool PrimitiveNode::Equals(const Node* other) const { - if (!Node::EqualsInternal(other)) { return false; } - return EqualsInternal(static_cast<const PrimitiveNode*>(other)); -} - -void PrimitiveNode::Visit(Node::Visitor* visitor) { - visitor->Visit(this); -} - -void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const { - visitor->Visit(this); -} - -// ---------------------------------------------------------------------- -// Group node - -bool GroupNode::EqualsInternal(const GroupNode* other) const { - if (this == other) { return true; } - if (this->field_count() != other->field_count()) { return false; } - for (int i = 0; i < this->field_count(); ++i) { - if (!this->field(i)->Equals(other->field(i).get())) { return false; } - } - return true; -} - -bool GroupNode::Equals(const Node* other) const { - if (!Node::EqualsInternal(other)) { return false; } - return EqualsInternal(static_cast<const GroupNode*>(other)); -} - -void GroupNode::Visit(Node::Visitor* visitor) { - visitor->Visit(this); -} - -void GroupNode::VisitConst(Node::ConstVisitor* visitor) const { - visitor->Visit(this); -} - -// ---------------------------------------------------------------------- -// Node construction from Parquet metadata - -struct NodeParams { - explicit NodeParams(const std::string& name) : name(name) {} - - const std::string& name; - Repetition::type repetition; - LogicalType::type logical_type; -}; - -static inline NodeParams GetNodeParams(const format::SchemaElement* element) { - NodeParams params(element->name); - - params.repetition = FromThrift(element->repetition_type); - if (element->__isset.converted_type) { - params.logical_type = FromThrift(element->converted_type); - } else { - params.logical_type = LogicalType::NONE; - } - return params; -} - -std::unique_ptr<Node> GroupNode::FromParquet( - const void* opaque_element, int node_id, const NodeVector& fields) { - const format::SchemaElement* element = - static_cast<const format::SchemaElement*>(opaque_element); - NodeParams params = GetNodeParams(element); - return std::unique_ptr<Node>(new GroupNode( - params.name, params.repetition, fields, params.logical_type, node_id)); -} - -std::unique_ptr<Node> PrimitiveNode::FromParquet( - const void* opaque_element, int node_id) { - const format::SchemaElement* element = - static_cast<const format::SchemaElement*>(opaque_element); - NodeParams params = GetNodeParams(element); - - std::unique_ptr<PrimitiveNode> result = - std::unique_ptr<PrimitiveNode>(new PrimitiveNode(params.name, params.repetition, - FromThrift(element->type), params.logical_type, element->type_length, - element->precision, element->scale, node_id)); - - // Return as unique_ptr to the base type - return std::unique_ptr<Node>(result.release()); -} - -void GroupNode::ToParquet(void* opaque_element) const { - format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element); - element->__set_name(name_); - element->__set_num_children(field_count()); - element->__set_repetition_type(ToThrift(repetition_)); - if (logical_type_ != LogicalType::NONE) { - element->__set_converted_type(ToThrift(logical_type_)); - } -} - -void PrimitiveNode::ToParquet(void* opaque_element) const { - format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element); - - element->__set_name(name_); - element->__set_num_children(0); - element->__set_repetition_type(ToThrift(repetition_)); - if (logical_type_ != LogicalType::NONE) { - element->__set_converted_type(ToThrift(logical_type_)); - } - element->__set_type(ToThrift(physical_type_)); - if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) { - element->__set_type_length(type_length_); - } - if (decimal_metadata_.isset) { - element->__set_precision(decimal_metadata_.precision); - element->__set_scale(decimal_metadata_.scale); - } -} - -} // namespace schema - -} // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/types.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h deleted file mode 100644 index f315480..0000000 --- a/src/parquet/schema/types.h +++ /dev/null @@ -1,292 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This module contains the logical parquet-cpp types (independent of Thrift -// structures), schema nodes, and related type tools - -#ifndef PARQUET_SCHEMA_TYPES_H -#define PARQUET_SCHEMA_TYPES_H - -#include <cstdint> -#include <memory> -#include <string> -#include <vector> - -#include "parquet/types.h" -#include "parquet/util/macros.h" -#include "parquet/util/visibility.h" - -namespace parquet { -namespace schema { - -// List encodings: using the terminology from Impala to define different styles -// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since -// the converted type named in the Parquet metadata is ConvertedType::LIST we -// use that terminology here. It also helps distinguish from the *_ARRAY -// primitive types. -// -// One-level encoding: Only allows required lists with required cells -// repeated value_type name -// -// Two-level encoding: Enables optional lists with only required cells -// <required/optional> group list -// repeated value_type item -// -// Three-level encoding: Enables optional lists with optional cells -// <required/optional> group bag -// repeated group list -// <required/optional> value_type item -// -// 2- and 1-level encoding are respectively equivalent to 3-level encoding with -// the non-repeated nodes set to required. -// -// The "official" encoding recommended in the Parquet spec is the 3-level, and -// we use that as the default when creating list types. For semantic completeness -// we allow the other two. Since all types of encodings will occur "in the -// wild" we need to be able to interpret the associated definition levels in -// the context of the actual encoding used in the file. -// -// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated -// SchemaElement, which could make things challenging if we are trying to infer -// that a sequence of nodes semantically represents an array according to one -// of these encodings (versus a struct containing an array). We should refuse -// the temptation to guess, as they say. -struct ListEncoding { - enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL }; -}; - -struct DecimalMetadata { - bool isset; - int32_t scale; - int32_t precision; -}; - -class PARQUET_EXPORT ColumnPath { - public: - ColumnPath() : path_() {} - explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {} - explicit ColumnPath(std::vector<std::string>&& path) : path_(path) {} - - static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring); - - std::shared_ptr<ColumnPath> extend(const std::string& node_name) const; - std::string ToDotString() const; - const std::vector<std::string>& ToDotVector() const; - - protected: - std::vector<std::string> path_; -}; - -class GroupNode; - -// Base class for logical schema types. A type has a name, repetition level, -// and optionally a logical type (ConvertedType in Parquet metadata parlance) -class PARQUET_EXPORT Node { - public: - enum type { PRIMITIVE, GROUP }; - - Node(Node::type type, const std::string& name, Repetition::type repetition, - LogicalType::type logical_type = LogicalType::NONE, int id = -1) - : type_(type), - name_(name), - repetition_(repetition), - logical_type_(logical_type), - id_(id), - parent_(nullptr) {} - - virtual ~Node() {} - - bool is_primitive() const { return type_ == Node::PRIMITIVE; } - - bool is_group() const { return type_ == Node::GROUP; } - - bool is_optional() const { return repetition_ == Repetition::OPTIONAL; } - - bool is_repeated() const { return repetition_ == Repetition::REPEATED; } - - bool is_required() const { return repetition_ == Repetition::REQUIRED; } - - virtual bool Equals(const Node* other) const = 0; - - const std::string& name() const { return name_; } - - Node::type node_type() const { return type_; } - - Repetition::type repetition() const { return repetition_; } - - LogicalType::type logical_type() const { return logical_type_; } - - int id() const { return id_; } - - const Node* parent() const { return parent_; } - - // ToParquet returns an opaque void* to avoid exporting - // parquet::SchemaElement into the public API - virtual void ToParquet(void* opaque_element) const = 0; - - // Node::Visitor abstract class for walking schemas with the visitor pattern - class Visitor { - public: - virtual ~Visitor() {} - - virtual void Visit(Node* node) = 0; - }; - class ConstVisitor { - public: - virtual ~ConstVisitor() {} - - virtual void Visit(const Node* node) = 0; - }; - - virtual void Visit(Visitor* visitor) = 0; - virtual void VisitConst(ConstVisitor* visitor) const = 0; - - protected: - friend class GroupNode; - - Node::type type_; - std::string name_; - Repetition::type repetition_; - LogicalType::type logical_type_; - int id_; - // Nodes should not be shared, they have a single parent. - const Node* parent_; - - bool EqualsInternal(const Node* other) const; - void SetParent(const Node* p_parent); -}; - -// Save our breath all over the place with these typedefs -typedef std::shared_ptr<Node> NodePtr; -typedef std::vector<NodePtr> NodeVector; - -// A type that is one of the primitive Parquet storage types. In addition to -// the other type metadata (name, repetition level, logical type), also has the -// physical storage type and their type-specific metadata (byte width, decimal -// parameters) -class PARQUET_EXPORT PrimitiveNode : public Node { - public: - // FromParquet accepts an opaque void* to avoid exporting - // parquet::SchemaElement into the public API - static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id); - - static inline NodePtr Make(const std::string& name, Repetition::type repetition, - Type::type type, LogicalType::type logical_type = LogicalType::NONE, - int length = -1, int precision = -1, int scale = -1) { - return NodePtr(new PrimitiveNode( - name, repetition, type, logical_type, length, precision, scale)); - } - - bool Equals(const Node* other) const override; - - Type::type physical_type() const { return physical_type_; } - - int32_t type_length() const { return type_length_; } - - const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; } - - void ToParquet(void* opaque_element) const override; - void Visit(Visitor* visitor) override; - void VisitConst(ConstVisitor* visitor) const override; - - private: - PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type, - LogicalType::type logical_type = LogicalType::NONE, int length = -1, - int precision = -1, int scale = -1, int id = -1); - - Type::type physical_type_; - int32_t type_length_; - DecimalMetadata decimal_metadata_; - - // For FIXED_LEN_BYTE_ARRAY - void SetTypeLength(int32_t length) { type_length_ = length; } - - // For Decimal logical type: Precision and scale - void SetDecimalMetadata(int32_t scale, int32_t precision) { - decimal_metadata_.scale = scale; - decimal_metadata_.precision = precision; - } - - bool EqualsInternal(const PrimitiveNode* other) const; - - FRIEND_TEST(TestPrimitiveNode, Attrs); - FRIEND_TEST(TestPrimitiveNode, Equals); - FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping); - FRIEND_TEST(TestPrimitiveNode, FromParquet); -}; - -class PARQUET_EXPORT GroupNode : public Node { - public: - // Like PrimitiveNode, GroupNode::FromParquet accepts an opaque void* to avoid exporting - // parquet::SchemaElement into the public API - static std::unique_ptr<Node> FromParquet( - const void* opaque_element, int id, const NodeVector& fields); - - static inline NodePtr Make(const std::string& name, Repetition::type repetition, - const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE) { - return NodePtr(new GroupNode(name, repetition, fields, logical_type)); - } - - bool Equals(const Node* other) const override; - - const NodePtr& field(int i) const { return fields_[i]; } - - int field_count() const { return fields_.size(); } - - void ToParquet(void* opaque_element) const override; - void Visit(Visitor* visitor) override; - void VisitConst(ConstVisitor* visitor) const override; - - private: - GroupNode(const std::string& name, Repetition::type repetition, - const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE, - int id = -1) - : Node(Node::GROUP, name, repetition, logical_type, id), fields_(fields) { - for (NodePtr& field : fields_) { - field->SetParent(this); - } - } - - NodeVector fields_; - bool EqualsInternal(const GroupNode* other) const; - - FRIEND_TEST(TestGroupNode, Attrs); - FRIEND_TEST(TestGroupNode, Equals); -}; - -// ---------------------------------------------------------------------- -// Convenience primitive type factory functions - -#define PRIMITIVE_FACTORY(FuncName, TYPE) \ - static inline NodePtr FuncName( \ - const std::string& name, Repetition::type repetition = Repetition::OPTIONAL) { \ - return PrimitiveNode::Make(name, repetition, Type::TYPE); \ - } - -PRIMITIVE_FACTORY(Boolean, BOOLEAN); -PRIMITIVE_FACTORY(Int32, INT32); -PRIMITIVE_FACTORY(Int64, INT64); -PRIMITIVE_FACTORY(Int96, INT96); -PRIMITIVE_FACTORY(Float, FLOAT); -PRIMITIVE_FACTORY(Double, DOUBLE); -PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY); - -} // namespace schema - -} // namespace parquet - -#endif // PARQUET_SCHEMA_TYPES_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/util/comparison-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index d2689ff..ec85485 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -21,7 +21,7 @@ #include <iostream> #include <vector> -#include "parquet/schema/descriptor.h" +#include "parquet/schema.h" #include "parquet/types.h" #include "parquet/util/comparison.h" http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/util/comparison.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/comparison.h b/src/parquet/util/comparison.h index 5ca7520..103f4c5 100644 --- a/src/parquet/util/comparison.h +++ b/src/parquet/util/comparison.h @@ -20,7 +20,7 @@ #include <algorithm> -#include "parquet/schema/descriptor.h" +#include "parquet/schema.h" #include "parquet/types.h" namespace parquet {
