Repository: parquet-cpp Updated Branches: refs/heads/master c0eec9a59 -> 04d75c7cb
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/schema-types-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc new file mode 100644 index 0000000..72d38c0 --- /dev/null +++ b/src/parquet/schema/schema-types-test.cc @@ -0,0 +1,231 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <cstdint> +#include <string> +#include <vector> + +#include <gtest/gtest.h> +#include "parquet/util/test-common.h" + +#include "parquet/schema/types.h" +#include "parquet/schema/test-util.h" + +using std::string; +using std::vector; + +namespace parquet_cpp { + +namespace schema { + +// ---------------------------------------------------------------------- +// Primitive node + +class TestPrimitiveNode : public ::testing::Test { + public: + void setUp() { + name_ = "name"; + id_ = 5; + } + + void Convert(const parquet::SchemaElement* element) { + node_ = PrimitiveNode::FromParquet(element, id_); + ASSERT_TRUE(node_->is_primitive()); + prim_node_ = static_cast<const PrimitiveNode*>(node_.get()); + } + + protected: + std::string name_; + const PrimitiveNode* prim_node_; + + int id_; + std::unique_ptr<Node> node_; +}; + +TEST_F(TestPrimitiveNode, Attrs) { + PrimitiveNode node1("foo", Repetition::REPEATED, Type::INT32); + + PrimitiveNode node2("bar", Repetition::OPTIONAL, Type::BYTE_ARRAY, + LogicalType::UTF8); + + ASSERT_EQ("foo", node1.name()); + + ASSERT_TRUE(node1.is_primitive()); + ASSERT_FALSE(node1.is_group()); + + ASSERT_EQ(Repetition::REPEATED, node1.repetition()); + ASSERT_EQ(Repetition::OPTIONAL, node2.repetition()); + + ASSERT_EQ(Node::PRIMITIVE, node1.node_type()); + + ASSERT_EQ(Type::INT32, node1.physical_type()); + ASSERT_EQ(Type::BYTE_ARRAY, node2.physical_type()); + + // logical types + ASSERT_EQ(LogicalType::NONE, node1.logical_type()); + ASSERT_EQ(LogicalType::UTF8, node2.logical_type()); + + // repetition + node1 = PrimitiveNode("foo", Repetition::REQUIRED, Type::INT32); + node2 = PrimitiveNode("foo", Repetition::OPTIONAL, Type::INT32); + PrimitiveNode node3("foo", Repetition::REPEATED, Type::INT32); + + ASSERT_TRUE(node1.is_required()); + + ASSERT_TRUE(node2.is_optional()); + ASSERT_FALSE(node2.is_required()); + + ASSERT_TRUE(node3.is_repeated()); + ASSERT_FALSE(node3.is_optional()); +} + +TEST_F(TestPrimitiveNode, FromParquet) { + SchemaElement elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, + parquet::Type::INT32); + Convert(&elt); + ASSERT_EQ(name_, prim_node_->name()); + ASSERT_EQ(id_, prim_node_->id()); + ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition()); + ASSERT_EQ(Type::INT32, prim_node_->physical_type()); + ASSERT_EQ(LogicalType::NONE, prim_node_->logical_type()); + + // Test a logical type + elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, parquet::Type::BYTE_ARRAY); + elt.__set_converted_type(ConvertedType::UTF8); + + Convert(&elt); + ASSERT_EQ(Repetition::REQUIRED, prim_node_->repetition()); + ASSERT_EQ(Type::BYTE_ARRAY, prim_node_->physical_type()); + ASSERT_EQ(LogicalType::UTF8, prim_node_->logical_type()); + + // FIXED_LEN_BYTE_ARRAY + elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, + parquet::Type::FIXED_LEN_BYTE_ARRAY); + elt.__set_type_length(16); + + Convert(&elt); + ASSERT_EQ(name_, prim_node_->name()); + ASSERT_EQ(id_, prim_node_->id()); + ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition()); + ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type()); + ASSERT_EQ(16, prim_node_->type_length()); + + // ConvertedType::Decimal + elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL, + parquet::Type::FIXED_LEN_BYTE_ARRAY); + elt.__set_converted_type(ConvertedType::DECIMAL); + elt.__set_type_length(6); + elt.__set_scale(12); + elt.__set_precision(2); + + Convert(&elt); + ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type()); + ASSERT_EQ(LogicalType::DECIMAL, prim_node_->logical_type()); + ASSERT_EQ(6, prim_node_->type_length()); + ASSERT_EQ(12, prim_node_->decimal_metadata().scale); + ASSERT_EQ(2, prim_node_->decimal_metadata().precision); +} + +TEST_F(TestPrimitiveNode, Equals) { + PrimitiveNode node1("foo", Repetition::REQUIRED, Type::INT32); + PrimitiveNode node2("foo", Repetition::REQUIRED, Type::INT64); + PrimitiveNode node3("bar", Repetition::REQUIRED, Type::INT32); + PrimitiveNode node4("foo", Repetition::OPTIONAL, Type::INT32); + PrimitiveNode node5("foo", Repetition::REQUIRED, Type::INT32); + + ASSERT_TRUE(node1.Equals(&node1)); + ASSERT_FALSE(node1.Equals(&node2)); + ASSERT_FALSE(node1.Equals(&node3)); + ASSERT_FALSE(node1.Equals(&node4)); + ASSERT_TRUE(node1.Equals(&node5)); + + PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY); + flba1.SetTypeLength(12); + + PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY); + flba2.SetTypeLength(12); + + PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY); + flba3.SetTypeLength(16); + + ASSERT_TRUE(flba1.Equals(&flba2)); + ASSERT_FALSE(flba1.Equals(&flba3)); +} + +// ---------------------------------------------------------------------- +// Group node + +class TestGroupNode : public ::testing::Test { + public: + NodeVector Fields1() { + NodeVector fields; + + fields.push_back(Int32("one", Repetition::REQUIRED)); + fields.push_back(Int64("two")); + fields.push_back(Double("three")); + + return fields; + } +}; + +TEST_F(TestGroupNode, Attrs) { + NodeVector fields = Fields1(); + + GroupNode node1("foo", Repetition::REPEATED, fields); + GroupNode node2("bar", Repetition::OPTIONAL, fields, LogicalType::LIST); + + ASSERT_EQ("foo", node1.name()); + + ASSERT_TRUE(node1.is_group()); + ASSERT_FALSE(node1.is_primitive()); + + ASSERT_EQ(fields.size(), node1.field_count()); + + ASSERT_TRUE(node1.is_repeated()); + ASSERT_TRUE(node2.is_optional()); + + ASSERT_EQ(Repetition::REPEATED, node1.repetition()); + ASSERT_EQ(Repetition::OPTIONAL, node2.repetition()); + + ASSERT_EQ(Node::GROUP, node1.node_type()); + + // logical types + ASSERT_EQ(LogicalType::NONE, node1.logical_type()); + ASSERT_EQ(LogicalType::LIST, node2.logical_type()); +} + +TEST_F(TestGroupNode, Equals) { + NodeVector f1 = Fields1(); + NodeVector f2 = Fields1(); + + GroupNode group1("group", Repetition::REPEATED, f1); + GroupNode group2("group", Repetition::REPEATED, f2); + GroupNode group3("group2", Repetition::REPEATED, f2); + + // This is copied in the GroupNode ctor, so this is okay + f2.push_back(Float("four", Repetition::OPTIONAL)); + GroupNode group4("group", Repetition::REPEATED, f2); + + ASSERT_TRUE(group1.Equals(&group2)); + ASSERT_FALSE(group1.Equals(&group3)); + + ASSERT_FALSE(group1.Equals(&group4)); +} + +} // namespace schema + +} // namespace parquet_cpp http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/test-util.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/test-util.h b/src/parquet/schema/test-util.h new file mode 100644 index 0000000..5593abd --- /dev/null +++ b/src/parquet/schema/test-util.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#ifndef PARQUET_SCHEMA_TEST_UTIL_H +#define PARQUET_SCHEMA_TEST_UTIL_H + +#include <string> + +#include "parquet/schema/types.h" +#include "parquet/thrift/parquet_types.h" + +using parquet::ConvertedType; +using parquet::FieldRepetitionType; +using parquet::SchemaElement; + +namespace parquet_cpp { + +namespace schema { + +static inline SchemaElement NewPrimitive(const std::string& name, + FieldRepetitionType::type repetition, parquet::Type::type type) { + SchemaElement result; + result.__set_name(name); + result.__set_repetition_type(repetition); + result.__set_type(type); + result.__set_num_children(0); + + return result; +} + +static inline SchemaElement NewGroup(const std::string& name, + FieldRepetitionType::type repetition, size_t num_children) { + SchemaElement result; + result.__set_name(name); + result.__set_repetition_type(repetition); + result.__set_num_children(num_children); + + return result; +} + +} // namespace schema + +} // namespace parquet_cpp + +#endif // PARQUET_COLUMN_TEST_UTIL_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/types.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc new file mode 100644 index 0000000..e088eed --- /dev/null +++ b/src/parquet/schema/types.cc @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/schema/types.h" + +#include <memory> + +#include "parquet/thrift/parquet_types.h" + +namespace parquet_cpp { + +namespace schema { + +// ---------------------------------------------------------------------- +// Base node + +bool Node::EqualsInternal(const Node* other) const { + return type_ == other->type_ && + name_ == other->name_ && + repetition_ == other->repetition_ && + logical_type_ == other->logical_type_; +} + +// ---------------------------------------------------------------------- +// Primitive node + +bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const { + if (physical_type_ != other->physical_type_) { + return false; + } else if (logical_type_ == LogicalType::DECIMAL) { + // TODO(wesm): metadata + ParquetException::NYI("comparing decimals"); + return false; + } else if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) { + return type_length_ == other->type_length_; + } + return true; +} + +bool PrimitiveNode::Equals(const Node* other) const { + if (!Node::EqualsInternal(other)) { + return false; + } + return EqualsInternal(static_cast<const PrimitiveNode*>(other)); +} + +void PrimitiveNode::Visit(Node::Visitor* visitor) { + visitor->Visit(this); +} + +// ---------------------------------------------------------------------- +// Group node + +bool GroupNode::EqualsInternal(const GroupNode* other) const { + if (this == other) { + return true; + } + if (this->field_count() != other->field_count()) { + return false; + } + for (size_t i = 0; i < this->field_count(); ++i) { + if (!this->field(i)->Equals(other->field(i).get())) { + return false; + } + } + return true; +} + +bool GroupNode::Equals(const Node* other) const { + if (!Node::EqualsInternal(other)) { + return false; + } + return EqualsInternal(static_cast<const GroupNode*>(other)); +} + +void GroupNode::Visit(Node::Visitor* visitor) { + visitor->Visit(this); +} + +// ---------------------------------------------------------------------- +// Node construction from Parquet metadata + +static Type::type ConvertEnum(parquet::Type::type type) { + return static_cast<Type::type>(type); +} + +static LogicalType::type ConvertEnum(parquet::ConvertedType::type type) { + // item 0 is NONE + return static_cast<LogicalType::type>(static_cast<int>(type) + 1); +} + +static Repetition::type ConvertEnum(parquet::FieldRepetitionType::type type) { + return static_cast<Repetition::type>(type); +} + +struct NodeParams { + explicit NodeParams(const std::string& name) : + name(name) {} + + const std::string& name; + Repetition::type repetition; + LogicalType::type logical_type; +}; + +static inline NodeParams GetNodeParams(const parquet::SchemaElement* element) { + NodeParams params(element->name); + + params.repetition = ConvertEnum(element->repetition_type); + if (element->__isset.converted_type) { + params.logical_type = ConvertEnum(element->converted_type); + } else { + params.logical_type = LogicalType::NONE; + } + return params; +} + +std::unique_ptr<Node> GroupNode::FromParquet(const void* opaque_element, int node_id, + const NodeVector& fields) { + const parquet::SchemaElement* element = + static_cast<const parquet::SchemaElement*>(opaque_element); + NodeParams params = GetNodeParams(element); + return std::unique_ptr<Node>(new GroupNode(params.name, params.repetition, fields, + params.logical_type, node_id)); +} + +std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element, + int node_id) { + const parquet::SchemaElement* element = + static_cast<const parquet::SchemaElement*>(opaque_element); + NodeParams params = GetNodeParams(element); + + std::unique_ptr<PrimitiveNode> result = std::unique_ptr<PrimitiveNode>( + new PrimitiveNode(params.name, params.repetition, + ConvertEnum(element->type), params.logical_type, node_id)); + + if (element->type == parquet::Type::FIXED_LEN_BYTE_ARRAY) { + result->SetTypeLength(element->type_length); + if (params.logical_type == LogicalType::DECIMAL) { + result->SetDecimalMetadata(element->scale, element->precision); + } + } + + // Return as unique_ptr to the base type + return std::unique_ptr<Node>(result.release()); +} + +} // namespace schema + +} // namespace parquet_cpp http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/types.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h new file mode 100644 index 0000000..82db233 --- /dev/null +++ b/src/parquet/schema/types.h @@ -0,0 +1,303 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module contains the logical parquet-cpp types (independent of Thrift +// structures), schema nodes, and related type tools + +#ifndef PARQUET_SCHEMA_TYPES_H +#define PARQUET_SCHEMA_TYPES_H + +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "parquet/exception.h" +#include "parquet/types.h" +#include "parquet/util/macros.h" + +namespace parquet_cpp { + +namespace schema { + +// List encodings: using the terminology from Impala to define different styles +// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since +// the converted type named in the Parquet metadata is ConvertedType::LIST we +// use that terminology here. It also helps distinguish from the *_ARRAY +// primitive types. +// +// One-level encoding: Only allows required lists with required cells +// repeated value_type name +// +// Two-level encoding: Enables optional lists with only required cells +// <required/optional> group list +// repeated value_type item +// +// Three-level encoding: Enables optional lists with optional cells +// <required/optional> group bag +// repeated group list +// <required/optional> value_type item +// +// 2- and 1-level encoding are respectively equivalent to 3-level encoding with +// the non-repeated nodes set to required. +// +// The "official" encoding recommended in the Parquet spec is the 3-level, and +// we use that as the default when creating list types. For semantic completeness +// we allow the other two. Since all types of encodings will occur "in the +// wild" we need to be able to interpret the associated definition levels in +// the context of the actual encoding used in the file. +// +// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated +// SchemaElement, which could make things challenging if we are trying to infer +// that a sequence of nodes semantically represents an array according to one +// of these encodings (versus a struct containing an array). We should refuse +// the temptation to guess, as they say. +struct ListEncoding { + enum type { + ONE_LEVEL, + TWO_LEVEL, + THREE_LEVEL + }; +}; + +struct DecimalMetadata { + int32_t scale; + int32_t precision; +}; + +// Base class for logical schema types. A type has a name, repetition level, +// and optionally a logical type (ConvertedType in Parquet metadata parlance) +class Node { + public: + enum type { + PRIMITIVE, + GROUP + }; + + Node(Node::type type, const std::string& name, + Repetition::type repetition, + LogicalType::type logical_type = LogicalType::NONE, + int id = -1) : + type_(type), + name_(name), + repetition_(repetition), + logical_type_(logical_type), + id_(id) {} + + virtual ~Node() {} + + bool is_primitive() const { + return type_ == Node::PRIMITIVE; + } + + bool is_group() const { + return type_ == Node::GROUP; + } + + bool is_optional() const { + return repetition_ == Repetition::OPTIONAL; + } + + bool is_repeated() const { + return repetition_ == Repetition::REPEATED; + } + + bool is_required() const { + return repetition_ == Repetition::REQUIRED; + } + + virtual bool Equals(const Node* other) const = 0; + + const std::string& name() const { + return name_; + } + + Node::type node_type() const { + return type_; + } + + Repetition::type repetition() const { + return repetition_; + } + + LogicalType::type logical_type() const { + return logical_type_; + } + + int id() const { + return id_; + } + + // Node::Visitor abstract class for walking schemas with the visitor pattern + class Visitor { + public: + virtual ~Visitor() {} + + virtual void Visit(const Node* node) = 0; + }; + + virtual void Visit(Visitor* visitor) = 0; + + protected: + Node::type type_; + std::string name_; + Repetition::type repetition_; + LogicalType::type logical_type_; + int id_; + + bool EqualsInternal(const Node* other) const; +}; + +// Save our breath all over the place with these typedefs +typedef std::shared_ptr<Node> NodePtr; +typedef std::vector<NodePtr> NodeVector; + +// A type that is one of the primitive Parquet storage types. In addition to +// the other type metadata (name, repetition level, logical type), also has the +// physical storage type and their type-specific metadata (byte width, decimal +// parameters) +class PrimitiveNode : public Node { + public: + // FromParquet accepts an opaque void* to avoid exporting + // parquet::SchemaElement into the public API + static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id); + + static inline NodePtr Make(const std::string& name, + Repetition::type repetition, Type::type type, + LogicalType::type logical_type = LogicalType::NONE) { + return NodePtr(new PrimitiveNode(name, repetition, type, logical_type)); + } + + // Alternate constructor for FIXED_LEN_BYTE_ARRAY (FLBA) + static inline NodePtr MakeFLBA(const std::string& name, + Repetition::type repetition, Type::type type, + int32_t type_length, + LogicalType::type logical_type = LogicalType::NONE) { + NodePtr result = Make(name, repetition, type, logical_type); + static_cast<PrimitiveNode*>(result.get())->SetTypeLength(type_length); + return result; + } + + virtual bool Equals(const Node* other) const; + + Type::type physical_type() const { + return physical_type_; + } + + int32_t type_length() const { + return type_length_; + } + + const DecimalMetadata& decimal_metadata() const { + return decimal_metadata_; + } + + virtual void Visit(Visitor* visitor); + + private: + PrimitiveNode(const std::string& name, Repetition::type repetition, + Type::type type, + LogicalType::type logical_type = LogicalType::NONE, + int id = -1) : + Node(Node::PRIMITIVE, name, repetition, logical_type, id), + physical_type_(type) {} + + Type::type physical_type_; + int32_t type_length_; + DecimalMetadata decimal_metadata_; + + // For FIXED_LEN_BYTE_ARRAY + void SetTypeLength(int32_t length) { + type_length_ = length; + } + + + // For Decimal logical type: Precision and scale + void SetDecimalMetadata(int32_t scale, int32_t precision) { + decimal_metadata_.scale = scale; + decimal_metadata_.precision = precision; + } + + bool EqualsInternal(const PrimitiveNode* other) const; + + FRIEND_TEST(TestPrimitiveNode, Attrs); + FRIEND_TEST(TestPrimitiveNode, Equals); + FRIEND_TEST(TestPrimitiveNode, FromParquet); +}; + +class GroupNode : public Node { + public: + // Like PrimitiveNode, GroupNode::FromParquet accepts an opaque void* to avoid exporting + // parquet::SchemaElement into the public API + static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id, + const NodeVector& fields); + + static inline NodePtr Make(const std::string& name, + Repetition::type repetition, const NodeVector& fields, + LogicalType::type logical_type = LogicalType::NONE) { + return NodePtr(new GroupNode(name, repetition, fields, logical_type)); + } + + virtual bool Equals(const Node* other) const; + + const NodePtr& field(size_t i) const { + return fields_[i]; + } + + size_t field_count() const { + return fields_.size(); + } + + virtual void Visit(Visitor* visitor); + + private: + GroupNode(const std::string& name, Repetition::type repetition, + const NodeVector& fields, + LogicalType::type logical_type = LogicalType::NONE, + int id = -1) : + Node(Node::GROUP, name, repetition, logical_type, id), + fields_(fields) {} + + NodeVector fields_; + bool EqualsInternal(const GroupNode* other) const; + + FRIEND_TEST(TestGroupNode, Attrs); + FRIEND_TEST(TestGroupNode, Equals); +}; + +// ---------------------------------------------------------------------- +// Convenience primitive type factory functions + +#define PRIMITIVE_FACTORY(FuncName, TYPE) \ + static inline NodePtr FuncName(const std::string& name, \ + Repetition::type repetition = Repetition::OPTIONAL) { \ + return PrimitiveNode::Make(name, repetition, Type::TYPE); \ + } + +PRIMITIVE_FACTORY(Boolean, BOOLEAN); +PRIMITIVE_FACTORY(Int32, INT32); +PRIMITIVE_FACTORY(Int64, INT64); +PRIMITIVE_FACTORY(Int96, INT96); +PRIMITIVE_FACTORY(Float, FLOAT); +PRIMITIVE_FACTORY(Double, DOUBLE); +PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY); + +} // namespace schema + +} // namespace parquet_cpp + +#endif // PARQUET_SCHEMA_TYPES_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/types.h ---------------------------------------------------------------------- diff --git a/src/parquet/types.h b/src/parquet/types.h index f39e3a2..2d15cad 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -24,11 +24,110 @@ #include <sstream> #include <string> -#include "parquet/thrift/parquet_types.h" #include "parquet/util/compiler-util.h" namespace parquet_cpp { +// ---------------------------------------------------------------------- +// Metadata enums to match Thrift metadata +// +// The reason we maintain our own enums is to avoid transitive dependency on +// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the +// public API. After building parquet-cpp, you should not need to include +// Thrift headers in your application. This means some boilerplate to convert +// between our types and Parquet's Thrift types. +// +// We can also add special values like NONE to distinguish between metadata +// values being set and not set. As an example consider ConvertedType and +// CompressionCodec + +// Mirrors parquet::Type +struct Type { + enum type { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + INT96 = 3, + FLOAT = 4, + DOUBLE = 5, + BYTE_ARRAY = 6, + FIXED_LEN_BYTE_ARRAY = 7 + }; +}; + +// Mirrors parquet::ConvertedType +struct LogicalType { + enum type { + NONE, + UTF8, + MAP, + MAP_KEY_VALUE, + LIST, + ENUM, + DECIMAL, + DATE, + TIME_MILLIS, + TIMESTAMP_MILLIS, + UINT_8, + UINT_16, + UINT_32, + UINT_64, + INT_8, + INT_16, + INT_32, + INT_64, + JSON, + BSON, + INTERVAL + }; +}; + +// Mirrors parquet::FieldRepetitionType +struct Repetition { + enum type { + REQUIRED = 0, + OPTIONAL = 1, + REPEATED = 2 + }; +}; + +// Data encodings. Mirrors parquet::Encoding +struct Encoding { + enum type { + PLAIN = 0, + PLAIN_DICTIONARY = 2, + RLE = 3, + BIT_PACKED = 4, + DELTA_BINARY_PACKED = 5, + DELTA_LENGTH_BYTE_ARRAY = 6, + DELTA_BYTE_ARRAY = 7, + RLE_DICTIONARY = 8 + }; +}; + +// Compression, mirrors parquet::CompressionCodec +struct Compression { + enum type { + NONE, + UNCOMPRESSED, + SNAPPY, + GZIP, + LZO + }; +}; + +// parquet::PageType +struct PageType { + enum type { + DATA_PAGE, + INDEX_PAGE, + DICTIONARY_PAGE, + DATA_PAGE_V2 + }; +}; + +// ---------------------------------------------------------------------- + struct ByteArray { uint32_t len; const uint8_t* ptr; @@ -80,72 +179,64 @@ struct type_traits { }; template <> -struct type_traits<parquet::Type::BOOLEAN> { +struct type_traits<Type::BOOLEAN> { typedef bool value_type; - static constexpr parquet::Type::type parquet_type = parquet::Type::BOOLEAN; static constexpr size_t value_byte_size = 1; static constexpr const char* printf_code = "d"; }; template <> -struct type_traits<parquet::Type::INT32> { +struct type_traits<Type::INT32> { typedef int32_t value_type; - static constexpr parquet::Type::type parquet_type = parquet::Type::INT32; static constexpr size_t value_byte_size = 4; static constexpr const char* printf_code = "d"; }; template <> -struct type_traits<parquet::Type::INT64> { +struct type_traits<Type::INT64> { typedef int64_t value_type; - static constexpr parquet::Type::type parquet_type = parquet::Type::INT64; static constexpr size_t value_byte_size = 8; static constexpr const char* printf_code = "ld"; }; template <> -struct type_traits<parquet::Type::INT96> { +struct type_traits<Type::INT96> { typedef Int96 value_type; - static constexpr parquet::Type::type parquet_type = parquet::Type::INT96; static constexpr size_t value_byte_size = 12; static constexpr const char* printf_code = "s"; }; template <> -struct type_traits<parquet::Type::FLOAT> { +struct type_traits<Type::FLOAT> { typedef float value_type; - static constexpr parquet::Type::type parquet_type = parquet::Type::FLOAT; static constexpr size_t value_byte_size = 4; static constexpr const char* printf_code = "f"; }; template <> -struct type_traits<parquet::Type::DOUBLE> { +struct type_traits<Type::DOUBLE> { typedef double value_type; - static constexpr parquet::Type::type parquet_type = parquet::Type::DOUBLE; static constexpr size_t value_byte_size = 8; static constexpr const char* printf_code = "lf"; }; template <> -struct type_traits<parquet::Type::BYTE_ARRAY> { +struct type_traits<Type::BYTE_ARRAY> { typedef ByteArray value_type; - static constexpr parquet::Type::type parquet_type = parquet::Type::BYTE_ARRAY; static constexpr size_t value_byte_size = sizeof(ByteArray); static constexpr const char* printf_code = "s"; }; template <> -struct type_traits<parquet::Type::FIXED_LEN_BYTE_ARRAY> { +struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> { typedef FixedLenByteArray value_type; - static constexpr parquet::Type::type parquet_type = parquet::Type::FIXED_LEN_BYTE_ARRAY; static constexpr size_t value_byte_size = sizeof(FixedLenByteArray); static constexpr const char* printf_code = "s"; @@ -158,6 +249,38 @@ inline std::string format_fwf(int width) { return ss.str(); } +static inline std::string type_to_string(Type::type t) { + switch (t) { + case Type::BOOLEAN: + return "BOOLEAN"; + break; + case Type::INT32: + return "INT32"; + break; + case Type::INT64: + return "INT64"; + break; + case Type::INT96: + return "INT96"; + break; + case Type::FLOAT: + return "FLOAT"; + break; + case Type::DOUBLE: + return "DOUBLE"; + break; + case Type::BYTE_ARRAY: + return "BYTE_ARRAY"; + break; + case Type::FIXED_LEN_BYTE_ARRAY: + return "FIXED_LEN_BYTE_ARRAY"; + break; + default: + return "UNKNOWN"; + break; + } +} + } // namespace parquet_cpp #endif // PARQUET_TYPES_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/util/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/parquet/util/CMakeLists.txt b/src/parquet/util/CMakeLists.txt index 1c86112..90a053f 100644 --- a/src/parquet/util/CMakeLists.txt +++ b/src/parquet/util/CMakeLists.txt @@ -24,6 +24,7 @@ install(FILES sse-info.h compiler-util.h logging.h + macros.h rle-encoding.h stopwatch.h input_stream.h http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/util/macros.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/macros.h b/src/parquet/util/macros.h new file mode 100644 index 0000000..7b301d6 --- /dev/null +++ b/src/parquet/util/macros.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_UTIL_MACROS_H +#define PARQUET_UTIL_MACROS_H + +// Useful macros from elsewhere + +// ---------------------------------------------------------------------- +// From googletest + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. For example: +// +// class MyClass { +// private: +// void MyMethod(); +// FRIEND_TEST(MyClassTest, MyMethod); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, MyMethod) { +// // Can call MyClass::MyMethod() here. +// } + +#define FRIEND_TEST(test_case_name, test_name)\ +friend class test_case_name##_##test_name##_Test + +#endif // PARQUET_UTIL_MACROS_H
