http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema.cc b/src/parquet/schema.cc new file mode 100644 index 0000000..13fca68 --- /dev/null +++ b/src/parquet/schema.cc @@ -0,0 +1,655 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "parquet/schema.h" +#include "parquet/schema-internal.h" + +#include <algorithm> +#include <memory> + +#include "parquet/exception.h" +#include "parquet/thrift/parquet_types.h" +#include "parquet/thrift/util.h" + +using parquet::format::SchemaElement; + +namespace parquet { + +namespace schema { + +// ---------------------------------------------------------------------- +// ColumnPath + +std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) { + std::stringstream ss(dotstring); + std::string item; + std::vector<std::string> path; + while (std::getline(ss, item, '.')) { + path.push_back(item); + } + return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(path))); +} + +std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const { + std::vector<std::string> path; + path.reserve(path_.size() + 1); + path.resize(path_.size() + 1); + std::copy(path_.cbegin(), path_.cend(), path.begin()); + path[path_.size()] = node_name; + + return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(path))); +} + +std::string ColumnPath::ToDotString() const { + std::stringstream ss; + for (auto it = path_.cbegin(); it != path_.cend(); ++it) { + if (it != path_.cbegin()) { ss << "."; } + ss << *it; + } + return ss.str(); +} + +const std::vector<std::string>& ColumnPath::ToDotVector() const { + return path_; +} + +// ---------------------------------------------------------------------- +// Base node + +bool Node::EqualsInternal(const Node* other) const { + return type_ == other->type_ && name_ == other->name_ && + repetition_ == other->repetition_ && logical_type_ == other->logical_type_; +} + +void Node::SetParent(const Node* parent) { + parent_ = parent; +} + +// ---------------------------------------------------------------------- +// Primitive node + +PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition, + Type::type type, LogicalType::type logical_type, int length, int precision, int 
scale, + int id) + : Node(Node::PRIMITIVE, name, repetition, logical_type, id), + physical_type_(type), + type_length_(length) { + std::stringstream ss; + + // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being + // set to true, but Impala will raise an incompatible metadata in such cases + memset(&decimal_metadata_, 0, sizeof(decimal_metadata_)); + + // Check if the physical and logical types match + // Mapping referred from Apache parquet-mr as on 2016-02-22 + switch (logical_type) { + case LogicalType::NONE: + // Logical type not set + break; + case LogicalType::UTF8: + case LogicalType::JSON: + case LogicalType::BSON: + if (type != Type::BYTE_ARRAY) { + ss << LogicalTypeToString(logical_type); + ss << " can only annotate BYTE_ARRAY fields"; + throw ParquetException(ss.str()); + } + break; + case LogicalType::DECIMAL: + if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) && + (type != Type::FIXED_LEN_BYTE_ARRAY)) { + ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED"; + throw ParquetException(ss.str()); + } + if (precision <= 0) { + ss << "Invalid DECIMAL precision: " << precision; + throw ParquetException(ss.str()); + } + if (scale < 0) { + ss << "Invalid DECIMAL scale: " << scale; + throw ParquetException(ss.str()); + } + if (scale > precision) { + ss << "Invalid DECIMAL scale " << scale; + ss << " cannot be greater than precision " << precision; + throw ParquetException(ss.str()); + } + decimal_metadata_.isset = true; + decimal_metadata_.precision = precision; + decimal_metadata_.scale = scale; + break; + case LogicalType::DATE: + case LogicalType::TIME_MILLIS: + case LogicalType::UINT_8: + case LogicalType::UINT_16: + case LogicalType::UINT_32: + case LogicalType::INT_8: + case LogicalType::INT_16: + case LogicalType::INT_32: + if (type != Type::INT32) { + ss << LogicalTypeToString(logical_type); + ss << " can only annotate INT32"; + throw ParquetException(ss.str()); + } + break; + case 
LogicalType::TIME_MICROS: + case LogicalType::TIMESTAMP_MILLIS: + case LogicalType::TIMESTAMP_MICROS: + case LogicalType::UINT_64: + case LogicalType::INT_64: + if (type != Type::INT64) { + ss << LogicalTypeToString(logical_type); + ss << " can only annotate INT64"; + throw ParquetException(ss.str()); + } + break; + case LogicalType::INTERVAL: + if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) { + ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)"; + throw ParquetException(ss.str()); + } + break; + case LogicalType::ENUM: + if (type != Type::BYTE_ARRAY) { + ss << "ENUM can only annotate BYTE_ARRAY fields"; + throw ParquetException(ss.str()); + } + break; + default: + ss << LogicalTypeToString(logical_type); + ss << " can not be applied to a primitive type"; + throw ParquetException(ss.str()); + } + if (type == Type::FIXED_LEN_BYTE_ARRAY) { + if (length <= 0) { + ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length; + throw ParquetException(ss.str()); + } + type_length_ = length; + } +} + +bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const { + bool is_equal = true; + if ((physical_type_ != other->physical_type_) || + (logical_type_ != other->logical_type_)) { + return false; + } + if (logical_type_ == LogicalType::DECIMAL) { + is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) && + (decimal_metadata_.scale == other->decimal_metadata_.scale); + } + if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) { + is_equal &= (type_length_ == other->type_length_); + } + return is_equal; +} + +bool PrimitiveNode::Equals(const Node* other) const { + if (!Node::EqualsInternal(other)) { return false; } + return EqualsInternal(static_cast<const PrimitiveNode*>(other)); +} + +void PrimitiveNode::Visit(Node::Visitor* visitor) { + visitor->Visit(this); +} + +void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const { + visitor->Visit(this); +} + +// 
---------------------------------------------------------------------- +// Group node + +bool GroupNode::EqualsInternal(const GroupNode* other) const { + if (this == other) { return true; } + if (this->field_count() != other->field_count()) { return false; } + for (int i = 0; i < this->field_count(); ++i) { + if (!this->field(i)->Equals(other->field(i).get())) { return false; } + } + return true; +} + +bool GroupNode::Equals(const Node* other) const { + if (!Node::EqualsInternal(other)) { return false; } + return EqualsInternal(static_cast<const GroupNode*>(other)); +} + +void GroupNode::Visit(Node::Visitor* visitor) { + visitor->Visit(this); +} + +void GroupNode::VisitConst(Node::ConstVisitor* visitor) const { + visitor->Visit(this); +} + +// ---------------------------------------------------------------------- +// Node construction from Parquet metadata + +struct NodeParams { + explicit NodeParams(const std::string& name) : name(name) {} + + const std::string& name; + Repetition::type repetition; + LogicalType::type logical_type; +}; + +static inline NodeParams GetNodeParams(const format::SchemaElement* element) { + NodeParams params(element->name); + + params.repetition = FromThrift(element->repetition_type); + if (element->__isset.converted_type) { + params.logical_type = FromThrift(element->converted_type); + } else { + params.logical_type = LogicalType::NONE; + } + return params; +} + +std::unique_ptr<Node> GroupNode::FromParquet( + const void* opaque_element, int node_id, const NodeVector& fields) { + const format::SchemaElement* element = + static_cast<const format::SchemaElement*>(opaque_element); + NodeParams params = GetNodeParams(element); + return std::unique_ptr<Node>(new GroupNode( + params.name, params.repetition, fields, params.logical_type, node_id)); +} + +std::unique_ptr<Node> PrimitiveNode::FromParquet( + const void* opaque_element, int node_id) { + const format::SchemaElement* element = + static_cast<const 
format::SchemaElement*>(opaque_element); + NodeParams params = GetNodeParams(element); + + std::unique_ptr<PrimitiveNode> result = + std::unique_ptr<PrimitiveNode>(new PrimitiveNode(params.name, params.repetition, + FromThrift(element->type), params.logical_type, element->type_length, + element->precision, element->scale, node_id)); + + // Return as unique_ptr to the base type + return std::unique_ptr<Node>(result.release()); +} + +void GroupNode::ToParquet(void* opaque_element) const { + format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element); + element->__set_name(name_); + element->__set_num_children(field_count()); + element->__set_repetition_type(ToThrift(repetition_)); + if (logical_type_ != LogicalType::NONE) { + element->__set_converted_type(ToThrift(logical_type_)); + } +} + +void PrimitiveNode::ToParquet(void* opaque_element) const { + format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element); + + element->__set_name(name_); + element->__set_num_children(0); + element->__set_repetition_type(ToThrift(repetition_)); + if (logical_type_ != LogicalType::NONE) { + element->__set_converted_type(ToThrift(logical_type_)); + } + element->__set_type(ToThrift(physical_type_)); + if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) { + element->__set_type_length(type_length_); + } + if (decimal_metadata_.isset) { + element->__set_precision(decimal_metadata_.precision); + element->__set_scale(decimal_metadata_.scale); + } +} + +// ---------------------------------------------------------------------- +// Schema converters + +std::unique_ptr<Node> FlatSchemaConverter::Convert() { + const SchemaElement& root = elements_[0]; + + // Validate the root node + if (root.num_children == 0) { + throw ParquetException("Root node did not have children"); + } + + // Relaxing this restriction as some implementations don't set this + // if (root.repetition_type != FieldRepetitionType::REPEATED) { + // throw ParquetException("Root 
node was not FieldRepetitionType::REPEATED"); + // } + + return NextNode(); +} + +std::unique_ptr<Node> FlatSchemaConverter::NextNode() { + const SchemaElement& element = Next(); + + int node_id = next_id(); + + const void* opaque_element = static_cast<const void*>(&element); + + if (element.num_children == 0) { + // Leaf (primitive) node + return PrimitiveNode::FromParquet(opaque_element, node_id); + } else { + // Group + NodeVector fields; + for (int i = 0; i < element.num_children; ++i) { + std::unique_ptr<Node> field = NextNode(); + fields.push_back(NodePtr(field.release())); + } + return GroupNode::FromParquet(opaque_element, node_id, fields); + } +} + +const format::SchemaElement& FlatSchemaConverter::Next() { + if (pos_ == length_) { + throw ParquetException("Malformed schema: not enough SchemaElement values"); + } + return elements_[pos_++]; +} + +std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) { + FlatSchemaConverter converter(&schema[0], schema.size()); + std::unique_ptr<Node> root = converter.Convert(); + + std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>(); + descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release()))); + + return descr; +} + +void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) { + SchemaFlattener flattener(schema, out); + flattener.Flatten(); +} + +class SchemaVisitor : public Node::ConstVisitor { + public: + explicit SchemaVisitor(std::vector<format::SchemaElement>* elements) + : elements_(elements) {} + virtual ~SchemaVisitor() {} + + void Visit(const Node* node) override { + format::SchemaElement element; + node->ToParquet(&element); + elements_->push_back(element); + + if (node->is_group()) { + const GroupNode* group_node = static_cast<const GroupNode*>(node); + for (int i = 0; i < group_node->field_count(); ++i) { + group_node->field(i)->VisitConst(this); + } + } + } + + private: + 
std::vector<format::SchemaElement>* elements_; +}; + +SchemaFlattener::SchemaFlattener( + const GroupNode* schema, std::vector<format::SchemaElement>* out) + : root_(schema), elements_(out) {} + +void SchemaFlattener::Flatten() { + SchemaVisitor visitor(elements_); + root_->VisitConst(&visitor); +} + +// ---------------------------------------------------------------------- +// Schema printing + +class SchemaPrinter : public Node::ConstVisitor { + public: + explicit SchemaPrinter(std::ostream& stream, int indent_width) + : stream_(stream), indent_(0), indent_width_(2) {} + + void Visit(const Node* node) override; + + private: + void Visit(const PrimitiveNode* node); + void Visit(const GroupNode* node); + + void Indent(); + + std::ostream& stream_; + + int indent_; + int indent_width_; +}; + +static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) { + switch (repetition) { + case Repetition::REQUIRED: + stream << "required"; + break; + case Repetition::OPTIONAL: + stream << "optional"; + break; + case Repetition::REPEATED: + stream << "repeated"; + break; + default: + break; + } +} + +static void PrintType(const PrimitiveNode* node, std::ostream& stream) { + switch (node->physical_type()) { + case Type::BOOLEAN: + stream << "boolean"; + break; + case Type::INT32: + stream << "int32"; + break; + case Type::INT64: + stream << "int64"; + break; + case Type::INT96: + stream << "int96"; + break; + case Type::FLOAT: + stream << "float"; + break; + case Type::DOUBLE: + stream << "double"; + break; + case Type::BYTE_ARRAY: + stream << "binary"; + break; + case Type::FIXED_LEN_BYTE_ARRAY: + stream << "fixed_len_byte_array(" << node->type_length() << ")"; + break; + default: + break; + } +} + +static void PrintLogicalType(const PrimitiveNode* node, std::ostream& stream) { + auto lt = node->logical_type(); + if (lt == LogicalType::DECIMAL) { + stream << " (" << LogicalTypeToString(lt) << "(" << node->decimal_metadata().precision + << "," << 
node->decimal_metadata().scale << "))"; + } else if (lt != LogicalType::NONE) { + stream << " (" << LogicalTypeToString(lt) << ")"; + } +} + +void SchemaPrinter::Visit(const PrimitiveNode* node) { + PrintRepLevel(node->repetition(), stream_); + stream_ << " "; + PrintType(node, stream_); + stream_ << " " << node->name(); + PrintLogicalType(node, stream_); + stream_ << ";" << std::endl; +} + +void SchemaPrinter::Visit(const GroupNode* node) { + if (!node->parent()) { + stream_ << "message " << node->name() << " {" << std::endl; + } else { + PrintRepLevel(node->repetition(), stream_); + stream_ << " group " << node->name(); + auto lt = node->logical_type(); + if (lt != LogicalType::NONE) { stream_ << " (" << LogicalTypeToString(lt) << ")"; } + stream_ << " {" << std::endl; + } + + indent_ += indent_width_; + for (int i = 0; i < node->field_count(); ++i) { + node->field(i)->VisitConst(this); + } + indent_ -= indent_width_; + Indent(); + stream_ << "}" << std::endl; +} + +void SchemaPrinter::Indent() { + if (indent_ > 0) { + std::string spaces(indent_, ' '); + stream_ << spaces; + } +} + +void SchemaPrinter::Visit(const Node* node) { + Indent(); + if (node->is_group()) { + Visit(static_cast<const GroupNode*>(node)); + } else { + // Primitive + Visit(static_cast<const PrimitiveNode*>(node)); + } +} + +void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) { + SchemaPrinter printer(stream, indent_width); + printer.Visit(schema); +} + +} // namespace schema + +using schema::ColumnPath; +using schema::Node; +using schema::NodePtr; +using schema::PrimitiveNode; +using schema::GroupNode; + +void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) { + Init(NodePtr(schema.release())); +} + +void SchemaDescriptor::Init(const NodePtr& schema) { + schema_ = schema; + + if (!schema_->is_group()) { + throw ParquetException("Must initialize with a schema group"); + } + + group_node_ = static_cast<const GroupNode*>(schema_.get()); + leaves_.clear(); + + 
for (int i = 0; i < group_node_->field_count(); ++i) { + BuildTree(group_node_->field(i), 0, 0, group_node_->field(i)); + } +} + +bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const { + if (this->num_columns() != other.num_columns()) { return false; } + + for (int i = 0; i < this->num_columns(); ++i) { + if (!this->Column(i)->Equals(*other.Column(i))) { return false; } + } + + return true; +} + +void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level, + int16_t max_rep_level, const NodePtr& base) { + if (node->is_optional()) { + ++max_def_level; + } else if (node->is_repeated()) { + // Repeated fields add a definition level. This is used to distinguish + // between an empty list and a list with an item in it. + ++max_rep_level; + ++max_def_level; + } + + // Now, walk the schema and create a ColumnDescriptor for each leaf node + if (node->is_group()) { + const GroupNode* group = static_cast<const GroupNode*>(node.get()); + for (int i = 0; i < group->field_count(); ++i) { + BuildTree(group->field(i), max_def_level, max_rep_level, base); + } + } else { + // Primitive node, append to leaves + leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this)); + leaf_to_base_.emplace(leaves_.size() - 1, base); + } +} + +ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node, + int16_t max_definition_level, int16_t max_repetition_level, + const SchemaDescriptor* schema_descr) + : node_(node), + max_definition_level_(max_definition_level), + max_repetition_level_(max_repetition_level), + schema_descr_(schema_descr) { + if (!node_->is_primitive()) { throw ParquetException("Must be a primitive type"); } + primitive_node_ = static_cast<const PrimitiveNode*>(node_.get()); +} + +bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const { + return primitive_node_->Equals(other.primitive_node_) && + max_repetition_level() == other.max_repetition_level() && + max_definition_level() == other.max_definition_level(); +} 
+ +const ColumnDescriptor* SchemaDescriptor::Column(int i) const { + DCHECK(i >= 0 && i < static_cast<int>(leaves_.size())); + return &leaves_[i]; +} + +const schema::NodePtr& SchemaDescriptor::GetColumnRoot(int i) const { + DCHECK(i >= 0 && i < static_cast<int>(leaves_.size())); + return leaf_to_base_.find(i)->second; +} + +int ColumnDescriptor::type_scale() const { + return primitive_node_->decimal_metadata().scale; +} + +int ColumnDescriptor::type_precision() const { + return primitive_node_->decimal_metadata().precision; +} + +int ColumnDescriptor::type_length() const { + return primitive_node_->type_length(); +} + +const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const { + // Build the path in reverse order as we traverse the nodes to the top + std::vector<std::string> rpath_; + const Node* node = primitive_node_; + // The schema node is not part of the ColumnPath + while (node->parent()) { + rpath_.push_back(node->name()); + node = node->parent(); + } + + // Build ColumnPath in correct order + std::vector<std::string> path_(rpath_.crbegin(), rpath_.crend()); + return std::make_shared<ColumnPath>(std::move(path_)); +} + +} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema.h b/src/parquet/schema.h new file mode 100644 index 0000000..30aea44 --- /dev/null +++ b/src/parquet/schema.h @@ -0,0 +1,405 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module contains the logical parquet-cpp types (independent of Thrift +// structures), schema nodes, and related type tools + +#ifndef PARQUET_SCHEMA_TYPES_H +#define PARQUET_SCHEMA_TYPES_H + +#include <cstdint> +#include <memory> +#include <ostream> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "parquet/types.h" +#include "parquet/util/macros.h" +#include "parquet/util/visibility.h" + +namespace parquet { + +class SchemaDescriptor; + +namespace schema { + +// List encodings: using the terminology from Impala to define different styles +// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since +// the converted type named in the Parquet metadata is ConvertedType::LIST we +// use that terminology here. It also helps distinguish from the *_ARRAY +// primitive types. 
+// +// One-level encoding: Only allows required lists with required cells +// repeated value_type name +// +// Two-level encoding: Enables optional lists with only required cells +// <required/optional> group list +// repeated value_type item +// +// Three-level encoding: Enables optional lists with optional cells +// <required/optional> group bag +// repeated group list +// <required/optional> value_type item +// +// 2- and 1-level encoding are respectively equivalent to 3-level encoding with +// the non-repeated nodes set to required. +// +// The "official" encoding recommended in the Parquet spec is the 3-level, and +// we use that as the default when creating list types. For semantic completeness +// we allow the other two. Since all types of encodings will occur "in the +// wild" we need to be able to interpret the associated definition levels in +// the context of the actual encoding used in the file. +// +// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated +// SchemaElement, which could make things challenging if we are trying to infer +// that a sequence of nodes semantically represents an array according to one +// of these encodings (versus a struct containing an array). We should refuse +// the temptation to guess, as they say. 
+struct ListEncoding { + enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL }; +}; + +struct DecimalMetadata { + bool isset; + int32_t scale; + int32_t precision; +}; + +class PARQUET_EXPORT ColumnPath { + public: + ColumnPath() : path_() {} + explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {} + explicit ColumnPath(std::vector<std::string>&& path) : path_(path) {} + + static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring); + + std::shared_ptr<ColumnPath> extend(const std::string& node_name) const; + std::string ToDotString() const; + const std::vector<std::string>& ToDotVector() const; + + protected: + std::vector<std::string> path_; +}; + +class GroupNode; + +// Base class for logical schema types. A type has a name, repetition level, +// and optionally a logical type (ConvertedType in Parquet metadata parlance) +class PARQUET_EXPORT Node { + public: + enum type { PRIMITIVE, GROUP }; + + Node(Node::type type, const std::string& name, Repetition::type repetition, + LogicalType::type logical_type = LogicalType::NONE, int id = -1) + : type_(type), + name_(name), + repetition_(repetition), + logical_type_(logical_type), + id_(id), + parent_(nullptr) {} + + virtual ~Node() {} + + bool is_primitive() const { return type_ == Node::PRIMITIVE; } + + bool is_group() const { return type_ == Node::GROUP; } + + bool is_optional() const { return repetition_ == Repetition::OPTIONAL; } + + bool is_repeated() const { return repetition_ == Repetition::REPEATED; } + + bool is_required() const { return repetition_ == Repetition::REQUIRED; } + + virtual bool Equals(const Node* other) const = 0; + + const std::string& name() const { return name_; } + + Node::type node_type() const { return type_; } + + Repetition::type repetition() const { return repetition_; } + + LogicalType::type logical_type() const { return logical_type_; } + + int id() const { return id_; } + + const Node* parent() const { return parent_; } + + // ToParquet returns an 
opaque void* to avoid exporting + // parquet::SchemaElement into the public API + virtual void ToParquet(void* opaque_element) const = 0; + + // Node::Visitor abstract class for walking schemas with the visitor pattern + class Visitor { + public: + virtual ~Visitor() {} + + virtual void Visit(Node* node) = 0; + }; + class ConstVisitor { + public: + virtual ~ConstVisitor() {} + + virtual void Visit(const Node* node) = 0; + }; + + virtual void Visit(Visitor* visitor) = 0; + virtual void VisitConst(ConstVisitor* visitor) const = 0; + + protected: + friend class GroupNode; + + Node::type type_; + std::string name_; + Repetition::type repetition_; + LogicalType::type logical_type_; + int id_; + // Nodes should not be shared, they have a single parent. + const Node* parent_; + + bool EqualsInternal(const Node* other) const; + void SetParent(const Node* p_parent); +}; + +// Save our breath all over the place with these typedefs +typedef std::shared_ptr<Node> NodePtr; +typedef std::vector<NodePtr> NodeVector; + +// A type that is one of the primitive Parquet storage types. 
In addition to +// the other type metadata (name, repetition level, logical type), also has the +// physical storage type and their type-specific metadata (byte width, decimal +// parameters) +class PARQUET_EXPORT PrimitiveNode : public Node { + public: + // FromParquet accepts an opaque void* to avoid exporting + // parquet::SchemaElement into the public API + static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id); + + static inline NodePtr Make(const std::string& name, Repetition::type repetition, + Type::type type, LogicalType::type logical_type = LogicalType::NONE, + int length = -1, int precision = -1, int scale = -1) { + return NodePtr(new PrimitiveNode( + name, repetition, type, logical_type, length, precision, scale)); + } + + bool Equals(const Node* other) const override; + + Type::type physical_type() const { return physical_type_; } + + int32_t type_length() const { return type_length_; } + + const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; } + + void ToParquet(void* opaque_element) const override; + void Visit(Visitor* visitor) override; + void VisitConst(ConstVisitor* visitor) const override; + + private: + PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type, + LogicalType::type logical_type = LogicalType::NONE, int length = -1, + int precision = -1, int scale = -1, int id = -1); + + Type::type physical_type_; + int32_t type_length_; + DecimalMetadata decimal_metadata_; + + // For FIXED_LEN_BYTE_ARRAY + void SetTypeLength(int32_t length) { type_length_ = length; } + + // For Decimal logical type: Precision and scale + void SetDecimalMetadata(int32_t scale, int32_t precision) { + decimal_metadata_.scale = scale; + decimal_metadata_.precision = precision; + } + + bool EqualsInternal(const PrimitiveNode* other) const; + + FRIEND_TEST(TestPrimitiveNode, Attrs); + FRIEND_TEST(TestPrimitiveNode, Equals); + FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping); + 
FRIEND_TEST(TestPrimitiveNode, FromParquet); +}; + +class PARQUET_EXPORT GroupNode : public Node { + public: + // Like PrimitiveNode, GroupNode::FromParquet accepts an opaque void* to avoid exporting + // parquet::SchemaElement into the public API + static std::unique_ptr<Node> FromParquet( + const void* opaque_element, int id, const NodeVector& fields); + + static inline NodePtr Make(const std::string& name, Repetition::type repetition, + const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE) { + return NodePtr(new GroupNode(name, repetition, fields, logical_type)); + } + + bool Equals(const Node* other) const override; + + const NodePtr& field(int i) const { return fields_[i]; } + + int field_count() const { return fields_.size(); } + + void ToParquet(void* opaque_element) const override; + void Visit(Visitor* visitor) override; + void VisitConst(ConstVisitor* visitor) const override; + + private: + GroupNode(const std::string& name, Repetition::type repetition, + const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE, + int id = -1) + : Node(Node::GROUP, name, repetition, logical_type, id), fields_(fields) { + for (NodePtr& field : fields_) { + field->SetParent(this); + } + } + + NodeVector fields_; + bool EqualsInternal(const GroupNode* other) const; + + FRIEND_TEST(TestGroupNode, Attrs); + FRIEND_TEST(TestGroupNode, Equals); +}; + +// ---------------------------------------------------------------------- +// Convenience primitive type factory functions + +#define PRIMITIVE_FACTORY(FuncName, TYPE) \ + static inline NodePtr FuncName( \ + const std::string& name, Repetition::type repetition = Repetition::OPTIONAL) { \ + return PrimitiveNode::Make(name, repetition, Type::TYPE); \ + } + +PRIMITIVE_FACTORY(Boolean, BOOLEAN); +PRIMITIVE_FACTORY(Int32, INT32); +PRIMITIVE_FACTORY(Int64, INT64); +PRIMITIVE_FACTORY(Int96, INT96); +PRIMITIVE_FACTORY(Float, FLOAT); +PRIMITIVE_FACTORY(Double, DOUBLE); 
+PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY); + +void PARQUET_EXPORT PrintSchema( + const schema::Node* schema, std::ostream& stream, int indent_width = 2); + +} // namespace schema + +// The ColumnDescriptor encapsulates information necessary to interpret +// primitive column data in the context of a particular schema. We have to +// examine the node structure of a column's path to the root in the schema tree +// to be able to reassemble the nested structure from the repetition and +// definition levels. +class PARQUET_EXPORT ColumnDescriptor { + public: + ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level, + int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr); + + bool Equals(const ColumnDescriptor& other) const; + + int16_t max_definition_level() const { return max_definition_level_; } + + int16_t max_repetition_level() const { return max_repetition_level_; } + + Type::type physical_type() const { return primitive_node_->physical_type(); } + + LogicalType::type logical_type() const { return primitive_node_->logical_type(); } + + const std::string& name() const { return primitive_node_->name(); } + + const std::shared_ptr<schema::ColumnPath> path() const; + + const schema::NodePtr& schema_node() const { return node_; } + + int type_length() const; + + int type_precision() const; + + int type_scale() const; + + private: + schema::NodePtr node_; + const schema::PrimitiveNode* primitive_node_; + + int16_t max_definition_level_; + int16_t max_repetition_level_; + + // When this descriptor is part of a real schema (and not being used for + // testing purposes), maintain a link back to the parent SchemaDescriptor to + // enable reverse graph traversals + const SchemaDescriptor* schema_descr_; +}; + +// Container for the converted Parquet schema with a computed information from +// the schema analysis needed for file reading +// +// * Column index to Node +// * Max repetition / definition levels for each primitive node +// 
+// The ColumnDescriptor objects produced by this class can be used to assist in +// the reconstruction of fully materialized data structures from the +// repetition-definition level encoding of nested data +// +// TODO(wesm): this object can be recomputed from a Schema +class PARQUET_EXPORT SchemaDescriptor { + public: + SchemaDescriptor() {} + ~SchemaDescriptor() {} + + // Analyze the schema + void Init(std::unique_ptr<schema::Node> schema); + void Init(const schema::NodePtr& schema); + + const ColumnDescriptor* Column(int i) const; + + bool Equals(const SchemaDescriptor& other) const; + + // The number of physical columns appearing in the file + int num_columns() const { return leaves_.size(); } + + const schema::NodePtr& schema_root() const { return schema_; } + + const schema::GroupNode* group_node() const { return group_node_; } + + // Returns the root (child of the schema root) node of the leaf(column) node + const schema::NodePtr& GetColumnRoot(int i) const; + + const std::string& name() const { return group_node_->name(); } + + private: + friend class ColumnDescriptor; + + schema::NodePtr schema_; + const schema::GroupNode* group_node_; + + void BuildTree(const schema::NodePtr& node, int16_t max_def_level, + int16_t max_rep_level, const schema::NodePtr& base); + + // Result of leaf node / tree analysis + std::vector<ColumnDescriptor> leaves_; + + // Mapping between leaf nodes and root group of leaf (first node + // below the schema's root group) + // + // For example, the leaf `a.b.c.d` would have a link back to `a` + // + // -- a <------ + // -- -- b | + // -- -- -- c | + // -- -- -- -- d + std::unordered_map<int, const schema::NodePtr> leaf_to_base_; +}; + +} // namespace parquet + +#endif // PARQUET_SCHEMA_TYPES_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/parquet/schema/CMakeLists.txt 
b/src/parquet/schema/CMakeLists.txt deleted file mode 100644 index 8aa9969..0000000 --- a/src/parquet/schema/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Headers: top level -install(FILES - descriptor.h - printer.h - types.h - DESTINATION include/parquet/schema) - -ADD_PARQUET_TEST(schema-converter-test) -ADD_PARQUET_TEST(schema-descriptor-test) -ADD_PARQUET_TEST(schema-printer-test) -ADD_PARQUET_TEST(schema-types-test) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/converter.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/converter.cc b/src/parquet/schema/converter.cc deleted file mode 100644 index 3b18af3..0000000 --- a/src/parquet/schema/converter.cc +++ /dev/null @@ -1,124 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "parquet/schema/converter.h" - -#include "parquet/exception.h" -#include "parquet/schema/descriptor.h" -#include "parquet/schema/types.h" -#include "parquet/thrift/parquet_types.h" - -using parquet::format::SchemaElement; - -namespace parquet { - -namespace schema { - -std::unique_ptr<Node> FlatSchemaConverter::Convert() { - const SchemaElement& root = elements_[0]; - - // Validate the root node - if (root.num_children == 0) { - throw ParquetException("Root node did not have children"); - } - - // Relaxing this restriction as some implementations don't set this - // if (root.repetition_type != FieldRepetitionType::REPEATED) { - // throw ParquetException("Root node was not FieldRepetitionType::REPEATED"); - // } - - return NextNode(); -} - -std::unique_ptr<Node> FlatSchemaConverter::NextNode() { - const SchemaElement& element = Next(); - - int node_id = next_id(); - - const void* opaque_element = static_cast<const void*>(&element); - - if (element.num_children == 0) { - // Leaf (primitive) node - return PrimitiveNode::FromParquet(opaque_element, node_id); - } else { - // Group - NodeVector fields; - for (int i = 0; i < element.num_children; ++i) { - std::unique_ptr<Node> field = NextNode(); - fields.push_back(NodePtr(field.release())); - } - return GroupNode::FromParquet(opaque_element, node_id, fields); - } -} - -const format::SchemaElement& FlatSchemaConverter::Next() { - if (pos_ == length_) { - throw ParquetException("Malformed schema: not enough SchemaElement values"); - } - return elements_[pos_++]; -} - 
-std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) { - FlatSchemaConverter converter(&schema[0], schema.size()); - std::unique_ptr<Node> root = converter.Convert(); - - std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>(); - descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release()))); - - return descr; -} - -void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) { - SchemaFlattener flattener(schema, out); - flattener.Flatten(); -} - -class SchemaVisitor : public Node::ConstVisitor { - public: - explicit SchemaVisitor(std::vector<format::SchemaElement>* elements) - : elements_(elements) {} - virtual ~SchemaVisitor() {} - - void Visit(const Node* node) override { - format::SchemaElement element; - node->ToParquet(&element); - elements_->push_back(element); - - if (node->is_group()) { - const GroupNode* group_node = static_cast<const GroupNode*>(node); - for (int i = 0; i < group_node->field_count(); ++i) { - group_node->field(i)->VisitConst(this); - } - } - } - - private: - std::vector<format::SchemaElement>* elements_; -}; - -SchemaFlattener::SchemaFlattener( - const GroupNode* schema, std::vector<format::SchemaElement>* out) - : root_(schema), elements_(out) {} - -void SchemaFlattener::Flatten() { - SchemaVisitor visitor(elements_); - root_->VisitConst(&visitor); -} - -} // namespace schema - -} // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/converter.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/converter.h b/src/parquet/schema/converter.h deleted file mode 100644 index 617d985..0000000 --- a/src/parquet/schema/converter.h +++ /dev/null @@ -1,91 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Conversion routines for converting to and from flat Parquet metadata. Among -// other things, this limits the exposure of the internals of the Thrift -// metadata structs to the rest of the library. - -// NB: This file is not part of the schema public API and only used internally -// for converting to and from Parquet Thrift metadata - -#ifndef PARQUET_SCHEMA_CONVERTER_H -#define PARQUET_SCHEMA_CONVERTER_H - -#include <memory> -#include <vector> - -namespace parquet { - -namespace format { -class SchemaElement; -} - -class SchemaDescriptor; - -namespace schema { - -class GroupNode; -class Node; - -// ---------------------------------------------------------------------- -// Conversion from Parquet Thrift metadata - -std::shared_ptr<SchemaDescriptor> FromParquet( - const std::vector<format::SchemaElement>& schema); - -class FlatSchemaConverter { - public: - FlatSchemaConverter(const format::SchemaElement* elements, int length) - : elements_(elements), length_(length), pos_(0), current_id_(0) {} - - std::unique_ptr<Node> Convert(); - - private: - const format::SchemaElement* elements_; - int length_; - int pos_; - int current_id_; - - int next_id() { return current_id_++; } - - const format::SchemaElement& Next(); - - std::unique_ptr<Node> NextNode(); -}; - -// 
---------------------------------------------------------------------- -// Conversion to Parquet Thrift metadata - -void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out); - -// Converts nested parquet schema back to a flat vector of Thrift structs -class SchemaFlattener { - public: - SchemaFlattener(const GroupNode* schema, std::vector<format::SchemaElement>* out); - - void Flatten(); - - private: - const GroupNode* root_; - std::vector<format::SchemaElement>* elements_; -}; - -} // namespace schema - -} // namespace parquet - -#endif // PARQUET_SCHEMA_CONVERTER_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/descriptor.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc deleted file mode 100644 index 0b0d006..0000000 --- a/src/parquet/schema/descriptor.cc +++ /dev/null @@ -1,138 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "parquet/schema/descriptor.h" - -#include "parquet/exception.h" -#include "parquet/util/logging.h" - -namespace parquet { - -using schema::ColumnPath; -using schema::Node; -using schema::NodePtr; -using schema::PrimitiveNode; -using schema::GroupNode; - -void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) { - Init(NodePtr(schema.release())); -} - -void SchemaDescriptor::Init(const NodePtr& schema) { - schema_ = schema; - - if (!schema_->is_group()) { - throw ParquetException("Must initialize with a schema group"); - } - - group_node_ = static_cast<const GroupNode*>(schema_.get()); - leaves_.clear(); - - for (int i = 0; i < group_node_->field_count(); ++i) { - BuildTree(group_node_->field(i), 0, 0, group_node_->field(i)); - } -} - -bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const { - if (this->num_columns() != other.num_columns()) { return false; } - - for (int i = 0; i < this->num_columns(); ++i) { - if (!this->Column(i)->Equals(*other.Column(i))) { return false; } - } - - return true; -} - -void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level, - int16_t max_rep_level, const NodePtr& base) { - if (node->is_optional()) { - ++max_def_level; - } else if (node->is_repeated()) { - // Repeated fields add a definition level. This is used to distinguish - // between an empty list and a list with an item in it. 
- ++max_rep_level; - ++max_def_level; - } - - // Now, walk the schema and create a ColumnDescriptor for each leaf node - if (node->is_group()) { - const GroupNode* group = static_cast<const GroupNode*>(node.get()); - for (int i = 0; i < group->field_count(); ++i) { - BuildTree(group->field(i), max_def_level, max_rep_level, base); - } - } else { - // Primitive node, append to leaves - leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this)); - leaf_to_base_.emplace(leaves_.size() - 1, base); - } -} - -ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node, - int16_t max_definition_level, int16_t max_repetition_level, - const SchemaDescriptor* schema_descr) - : node_(node), - max_definition_level_(max_definition_level), - max_repetition_level_(max_repetition_level), - schema_descr_(schema_descr) { - if (!node_->is_primitive()) { throw ParquetException("Must be a primitive type"); } - primitive_node_ = static_cast<const PrimitiveNode*>(node_.get()); -} - -bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const { - return primitive_node_->Equals(other.primitive_node_) && - max_repetition_level() == other.max_repetition_level() && - max_definition_level() == other.max_definition_level(); -} - -const ColumnDescriptor* SchemaDescriptor::Column(int i) const { - DCHECK(i >= 0 && i < static_cast<int>(leaves_.size())); - return &leaves_[i]; -} - -const schema::NodePtr& SchemaDescriptor::GetColumnRoot(int i) const { - DCHECK(i >= 0 && i < static_cast<int>(leaves_.size())); - return leaf_to_base_.find(i)->second; -} - -int ColumnDescriptor::type_scale() const { - return primitive_node_->decimal_metadata().scale; -} - -int ColumnDescriptor::type_precision() const { - return primitive_node_->decimal_metadata().precision; -} - -int ColumnDescriptor::type_length() const { - return primitive_node_->type_length(); -} - -const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const { - // Build the path in reverse order as we traverse the 
nodes to the top - std::vector<std::string> rpath_; - const Node* node = primitive_node_; - // The schema node is not part of the ColumnPath - while (node->parent()) { - rpath_.push_back(node->name()); - node = node->parent(); - } - - // Build ColumnPath in correct order - std::vector<std::string> path_(rpath_.crbegin(), rpath_.crend()); - return std::make_shared<ColumnPath>(std::move(path_)); -} - -} // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/descriptor.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h deleted file mode 100644 index ae7b60e..0000000 --- a/src/parquet/schema/descriptor.h +++ /dev/null @@ -1,142 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PARQUET_SCHEMA_DESCRIPTOR_H -#define PARQUET_SCHEMA_DESCRIPTOR_H - -#include "parquet/schema/types.h" -#include "parquet/types.h" -#include "parquet/util/visibility.h" -#include <cstdint> -#include <cstdlib> -#include <memory> -#include <string> -#include <unordered_map> -#include <vector> - -namespace parquet { - -class SchemaDescriptor; - -// The ColumnDescriptor encapsulates information necessary to interpret -// primitive column data in the context of a particular schema. We have to -// examine the node structure of a column's path to the root in the schema tree -// to be able to reassemble the nested structure from the repetition and -// definition levels. -class PARQUET_EXPORT ColumnDescriptor { - public: - ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level, - int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr); - - bool Equals(const ColumnDescriptor& other) const; - - int16_t max_definition_level() const { return max_definition_level_; } - - int16_t max_repetition_level() const { return max_repetition_level_; } - - Type::type physical_type() const { return primitive_node_->physical_type(); } - - LogicalType::type logical_type() const { return primitive_node_->logical_type(); } - - const std::string& name() const { return primitive_node_->name(); } - - const std::shared_ptr<schema::ColumnPath> path() const; - - const schema::NodePtr& schema_node() const { return node_; } - - int type_length() const; - - int type_precision() const; - - int type_scale() const; - - private: - schema::NodePtr node_; - const schema::PrimitiveNode* primitive_node_; - - int16_t max_definition_level_; - int16_t max_repetition_level_; - - // When this descriptor is part of a real schema (and not being used for - // testing purposes), maintain a link back to the parent SchemaDescriptor to - // enable reverse graph traversals - const SchemaDescriptor* schema_descr_; -}; - -// Container for the converted Parquet schema with a 
computed information from -// the schema analysis needed for file reading -// -// * Column index to Node -// * Max repetition / definition levels for each primitive node -// -// The ColumnDescriptor objects produced by this class can be used to assist in -// the reconstruction of fully materialized data structures from the -// repetition-definition level encoding of nested data -// -// TODO(wesm): this object can be recomputed from a Schema -class PARQUET_EXPORT SchemaDescriptor { - public: - SchemaDescriptor() {} - ~SchemaDescriptor() {} - - // Analyze the schema - void Init(std::unique_ptr<schema::Node> schema); - void Init(const schema::NodePtr& schema); - - const ColumnDescriptor* Column(int i) const; - - bool Equals(const SchemaDescriptor& other) const; - - // The number of physical columns appearing in the file - int num_columns() const { return leaves_.size(); } - - const schema::NodePtr& schema_root() const { return schema_; } - - const schema::GroupNode* group_node() const { return group_node_; } - - // Returns the root (child of the schema root) node of the leaf(column) node - const schema::NodePtr& GetColumnRoot(int i) const; - - const std::string& name() const { return group_node_->name(); } - - private: - friend class ColumnDescriptor; - - schema::NodePtr schema_; - const schema::GroupNode* group_node_; - - void BuildTree(const schema::NodePtr& node, int16_t max_def_level, - int16_t max_rep_level, const schema::NodePtr& base); - - // Result of leaf node / tree analysis - std::vector<ColumnDescriptor> leaves_; - - // Mapping between leaf nodes and root group of leaf (first node - // below the schema's root group) - // - // For example, the leaf `a.b.c.d` would have a link back to `a` - // - // -- a <------ - // -- -- b | - // -- -- -- c | - // -- -- -- -- d - std::unordered_map<int, const schema::NodePtr> leaf_to_base_; -}; - -} // namespace parquet - -#endif // PARQUET_SCHEMA_DESCRIPTOR_H 
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/printer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/printer.cc b/src/parquet/schema/printer.cc deleted file mode 100644 index ca11244..0000000 --- a/src/parquet/schema/printer.cc +++ /dev/null @@ -1,159 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "parquet/schema/printer.h" - -#include <memory> -#include <string> - -#include "parquet/schema/types.h" -#include "parquet/types.h" - -namespace parquet { - -namespace schema { - -class SchemaPrinter : public Node::ConstVisitor { - public: - explicit SchemaPrinter(std::ostream& stream, int indent_width) - : stream_(stream), indent_(0), indent_width_(2) {} - - void Visit(const Node* node) override; - - private: - void Visit(const PrimitiveNode* node); - void Visit(const GroupNode* node); - - void Indent(); - - std::ostream& stream_; - - int indent_; - int indent_width_; -}; - -static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) { - switch (repetition) { - case Repetition::REQUIRED: - stream << "required"; - break; - case Repetition::OPTIONAL: - stream << "optional"; - break; - case Repetition::REPEATED: - stream << "repeated"; - break; - default: - break; - } -} - -static void PrintType(const PrimitiveNode* node, std::ostream& stream) { - switch (node->physical_type()) { - case Type::BOOLEAN: - stream << "boolean"; - break; - case Type::INT32: - stream << "int32"; - break; - case Type::INT64: - stream << "int64"; - break; - case Type::INT96: - stream << "int96"; - break; - case Type::FLOAT: - stream << "float"; - break; - case Type::DOUBLE: - stream << "double"; - break; - case Type::BYTE_ARRAY: - stream << "binary"; - break; - case Type::FIXED_LEN_BYTE_ARRAY: - stream << "fixed_len_byte_array(" << node->type_length() << ")"; - break; - default: - break; - } -} - -static void PrintLogicalType(const PrimitiveNode* node, std::ostream& stream) { - auto lt = node->logical_type(); - if (lt == LogicalType::DECIMAL) { - stream << " (" << LogicalTypeToString(lt) << "(" << node->decimal_metadata().precision - << "," << node->decimal_metadata().scale << "))"; - } else if (lt != LogicalType::NONE) { - stream << " (" << LogicalTypeToString(lt) << ")"; - } -} - -void SchemaPrinter::Visit(const PrimitiveNode* node) { - 
PrintRepLevel(node->repetition(), stream_); - stream_ << " "; - PrintType(node, stream_); - stream_ << " " << node->name(); - PrintLogicalType(node, stream_); - stream_ << ";" << std::endl; -} - -void SchemaPrinter::Visit(const GroupNode* node) { - if (!node->parent()) { - stream_ << "message " << node->name() << " {" << std::endl; - } else { - PrintRepLevel(node->repetition(), stream_); - stream_ << " group " << node->name(); - auto lt = node->logical_type(); - if (lt != LogicalType::NONE) { stream_ << " (" << LogicalTypeToString(lt) << ")"; } - stream_ << " {" << std::endl; - } - - indent_ += indent_width_; - for (int i = 0; i < node->field_count(); ++i) { - node->field(i)->VisitConst(this); - } - indent_ -= indent_width_; - Indent(); - stream_ << "}" << std::endl; -} - -void SchemaPrinter::Indent() { - if (indent_ > 0) { - std::string spaces(indent_, ' '); - stream_ << spaces; - } -} - -void SchemaPrinter::Visit(const Node* node) { - Indent(); - if (node->is_group()) { - Visit(static_cast<const GroupNode*>(node)); - } else { - // Primitive - Visit(static_cast<const PrimitiveNode*>(node)); - } -} - -void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) { - SchemaPrinter printer(stream, indent_width); - printer.Visit(schema); -} - -} // namespace schema - -} // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/printer.h ---------------------------------------------------------------------- diff --git a/src/parquet/schema/printer.h b/src/parquet/schema/printer.h deleted file mode 100644 index c37ef90..0000000 --- a/src/parquet/schema/printer.h +++ /dev/null @@ -1,40 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// A simple Schema printer using the visitor pattern - -#ifndef PARQUET_SCHEMA_PRINTER_H -#define PARQUET_SCHEMA_PRINTER_H - -#include <ostream> - -#include "parquet/util/visibility.h" - -namespace parquet { - -namespace schema { - -class Node; - -void PARQUET_EXPORT PrintSchema( - const Node* schema, std::ostream& stream, int indent_width = 2); - -} // namespace schema - -} // namespace parquet - -#endif // PARQUET_SCHEMA_PRINTER_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-converter-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-converter-test.cc b/src/parquet/schema/schema-converter-test.cc deleted file mode 100644 index c752919..0000000 --- a/src/parquet/schema/schema-converter-test.cc +++ /dev/null @@ -1,222 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include <gtest/gtest.h> - -#include <cstdlib> -#include <memory> -#include <string> -#include <vector> - -#include "parquet/exception.h" -#include "parquet/schema/converter.h" -#include "parquet/schema/test-util.h" -#include "parquet/schema/types.h" -#include "parquet/thrift/parquet_types.h" -#include "parquet/types.h" - -using std::string; -using std::vector; - -using parquet::format::ConvertedType; -using parquet::format::FieldRepetitionType; -using parquet::format::SchemaElement; - -namespace parquet { - -namespace schema { - -// ---------------------------------------------------------------------- -// Test convert group - -class TestSchemaConverter : public ::testing::Test { - public: - void setUp() { name_ = "parquet_schema"; } - - void Convert(const parquet::format::SchemaElement* elements, int length) { - FlatSchemaConverter converter(elements, length); - node_ = converter.Convert(); - ASSERT_TRUE(node_->is_group()); - group_ = static_cast<const GroupNode*>(node_.get()); - } - - protected: - std::string name_; - const GroupNode* group_; - std::unique_ptr<Node> node_; -}; - -bool check_for_parent_consistency(const GroupNode* node) { - // Each node should have the group as parent - for (int i = 0; i < node->field_count(); i++) { - const NodePtr& field = node->field(i); - if (field->parent() != node) { return false; } - if (field->is_group()) { - const GroupNode* group = static_cast<GroupNode*>(field.get()); - if (!check_for_parent_consistency(group)) { return false; } - } - } - return true; -} - -TEST_F(TestSchemaConverter, NestedExample) 
{ - SchemaElement elt; - std::vector<SchemaElement> elements; - elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); - - // A primitive one - elements.push_back( - NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1)); - - // A group - elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2)); - - // 3-level list encoding, by hand - elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3); - elt.__set_converted_type(ConvertedType::LIST); - elements.push_back(elt); - elements.push_back( - NewPrimitive("item", FieldRepetitionType::OPTIONAL, format::Type::INT64, 4)); - - Convert(&elements[0], elements.size()); - - // Construct the expected schema - NodeVector fields; - fields.push_back(Int32("a", Repetition::REQUIRED)); - - // 3-level list encoding - NodePtr item = Int64("item"); - NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST)); - NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); - fields.push_back(bag); - - NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields); - - ASSERT_TRUE(schema->Equals(group_)); - - // Check that the parent relationship in each node is consitent - ASSERT_EQ(group_->parent(), nullptr); - ASSERT_TRUE(check_for_parent_consistency(group_)); -} - -TEST_F(TestSchemaConverter, InvalidRoot) { - // According to the Parquet specification, the first element in the - // list<SchemaElement> is a group whose children (and their descendants) - // contain all of the rest of the flattened schema elements. If the first - // element is not a group, it is a malformed Parquet file. 
- - SchemaElement elements[2]; - elements[0] = - NewPrimitive("not-a-group", FieldRepetitionType::REQUIRED, format::Type::INT32, 0); - ASSERT_THROW(Convert(elements, 2), ParquetException); - - // While the Parquet spec indicates that the root group should have REPEATED - // repetition type, some implementations may return REQUIRED or OPTIONAL - // groups as the first element. These tests check that this is okay as a - // practicality matter. - elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1, 0); - elements[1] = NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1); - Convert(elements, 2); - - elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1, 0); - Convert(elements, 2); -} - -TEST_F(TestSchemaConverter, NotEnoughChildren) { - // Throw a ParquetException, but don't core dump or anything - SchemaElement elt; - std::vector<SchemaElement> elements; - elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); - ASSERT_THROW(Convert(&elements[0], 1), ParquetException); -} - -// ---------------------------------------------------------------------- -// Schema tree flatten / unflatten - -class TestSchemaFlatten : public ::testing::Test { - public: - void setUp() { name_ = "parquet_schema"; } - - void Flatten(const GroupNode* schema) { ToParquet(schema, &elements_); } - - protected: - std::string name_; - std::vector<format::SchemaElement> elements_; -}; - -TEST_F(TestSchemaFlatten, DecimalMetadata) { - // Checks that DecimalMetadata is only set for DecimalTypes - NodePtr node = PrimitiveNode::Make( - "decimal", Repetition::REQUIRED, Type::INT64, LogicalType::DECIMAL, -1, 8, 4); - NodePtr group = - GroupNode::Make("group", Repetition::REPEATED, {node}, LogicalType::LIST); - Flatten(reinterpret_cast<GroupNode*>(group.get())); - ASSERT_EQ("decimal", elements_[1].name); - ASSERT_TRUE(elements_[1].__isset.precision); - ASSERT_TRUE(elements_[1].__isset.scale); - - elements_.clear(); - // Not for 
integers with no logical type - group = - GroupNode::Make("group", Repetition::REPEATED, {Int64("int64")}, LogicalType::LIST); - Flatten(reinterpret_cast<GroupNode*>(group.get())); - ASSERT_EQ("int64", elements_[1].name); - ASSERT_FALSE(elements_[0].__isset.precision); - ASSERT_FALSE(elements_[0].__isset.scale); -} - -TEST_F(TestSchemaFlatten, NestedExample) { - SchemaElement elt; - std::vector<SchemaElement> elements; - elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0)); - - // A primitive one - elements.push_back( - NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1)); - - // A group - elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2)); - - // 3-level list encoding, by hand - elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3); - elt.__set_converted_type(ConvertedType::LIST); - elements.push_back(elt); - elements.push_back( - NewPrimitive("item", FieldRepetitionType::OPTIONAL, format::Type::INT64, 4)); - - // Construct the schema - NodeVector fields; - fields.push_back(Int32("a", Repetition::REQUIRED)); - - // 3-level list encoding - NodePtr item = Int64("item"); - NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST)); - NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); - fields.push_back(bag); - - NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields); - - Flatten(static_cast<GroupNode*>(schema.get())); - ASSERT_EQ(elements_.size(), elements.size()); - for (size_t i = 0; i < elements_.size(); i++) { - ASSERT_EQ(elements_[i], elements[i]); - } -} - -} // namespace schema - -} // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-descriptor-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc deleted file mode 100644 index 4b7f67c..0000000 --- 
a/src/parquet/schema/schema-descriptor-test.cc +++ /dev/null @@ -1,190 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Schema / column descriptor correctness tests (from flat Parquet schemas) - -#include <cstdint> -#include <cstdlib> -#include <gtest/gtest.h> -#include <string> -#include <vector> - -#include "parquet/exception.h" -#include "parquet/schema/descriptor.h" -#include "parquet/schema/types.h" -#include "parquet/types.h" - -using std::string; -using std::vector; - -namespace parquet { - -namespace schema { - -TEST(TestColumnDescriptor, TestAttrs) { - NodePtr node = PrimitiveNode::Make( - "name", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8); - ColumnDescriptor descr(node, 4, 1); - - ASSERT_EQ("name", descr.name()); - ASSERT_EQ(4, descr.max_definition_level()); - ASSERT_EQ(1, descr.max_repetition_level()); - - ASSERT_EQ(Type::BYTE_ARRAY, descr.physical_type()); - - ASSERT_EQ(-1, descr.type_length()); - - // Test FIXED_LEN_BYTE_ARRAY - node = PrimitiveNode::Make("name", Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::DECIMAL, 12, 10, 4); - descr = ColumnDescriptor(node, 4, 1); - - ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type()); - ASSERT_EQ(12, 
descr.type_length()); -} - -class TestSchemaDescriptor : public ::testing::Test { - public: - void setUp() {} - - protected: - SchemaDescriptor descr_; -}; - -TEST_F(TestSchemaDescriptor, InitNonGroup) { - NodePtr node = PrimitiveNode::Make("field", Repetition::OPTIONAL, Type::INT32); - - ASSERT_THROW(descr_.Init(node), ParquetException); -} - -TEST_F(TestSchemaDescriptor, Equals) { - NodePtr schema; - - NodePtr inta = Int32("a", Repetition::REQUIRED); - NodePtr intb = Int64("b", Repetition::OPTIONAL); - NodePtr intb2 = Int64("b2", Repetition::OPTIONAL); - NodePtr intc = ByteArray("c", Repetition::REPEATED); - - NodePtr item1 = Int64("item1", Repetition::REQUIRED); - NodePtr item2 = Boolean("item2", Repetition::OPTIONAL); - NodePtr item3 = Int32("item3", Repetition::REPEATED); - NodePtr list(GroupNode::Make( - "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST)); - - NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); - NodePtr bag2(GroupNode::Make("bag", Repetition::REQUIRED, {list})); - - SchemaDescriptor descr1; - descr1.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag})); - - ASSERT_TRUE(descr1.Equals(descr1)); - - SchemaDescriptor descr2; - descr2.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag2})); - ASSERT_FALSE(descr1.Equals(descr2)); - - SchemaDescriptor descr3; - descr3.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb2, intc, bag})); - ASSERT_FALSE(descr1.Equals(descr3)); - - // Robust to name of parent node - SchemaDescriptor descr4; - descr4.Init(GroupNode::Make("SCHEMA", Repetition::REPEATED, {inta, intb, intc, bag})); - ASSERT_TRUE(descr1.Equals(descr4)); - - SchemaDescriptor descr5; - descr5.Init( - GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag, intb2})); - ASSERT_FALSE(descr1.Equals(descr5)); - - // Different max repetition / definition levels - ColumnDescriptor col1(inta, 5, 1); - ColumnDescriptor col2(inta, 6, 1); - 
ColumnDescriptor col3(inta, 5, 2); - - ASSERT_TRUE(col1.Equals(col1)); - ASSERT_FALSE(col1.Equals(col2)); - ASSERT_FALSE(col1.Equals(col3)); -} - -TEST_F(TestSchemaDescriptor, BuildTree) { - NodeVector fields; - NodePtr schema; - - NodePtr inta = Int32("a", Repetition::REQUIRED); - fields.push_back(inta); - fields.push_back(Int64("b", Repetition::OPTIONAL)); - fields.push_back(ByteArray("c", Repetition::REPEATED)); - - // 3-level list encoding - NodePtr item1 = Int64("item1", Repetition::REQUIRED); - NodePtr item2 = Boolean("item2", Repetition::OPTIONAL); - NodePtr item3 = Int32("item3", Repetition::REPEATED); - NodePtr list(GroupNode::Make( - "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST)); - NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); - fields.push_back(bag); - - schema = GroupNode::Make("schema", Repetition::REPEATED, fields); - - descr_.Init(schema); - - int nleaves = 6; - - // 6 leaves - ASSERT_EQ(nleaves, descr_.num_columns()); - - // mdef mrep - // required int32 a 0 0 - // optional int64 b 1 0 - // repeated byte_array c 1 1 - // optional group bag 1 0 - // repeated group records 2 1 - // required int64 item1 2 1 - // optional boolean item2 3 1 - // repeated int32 item3 3 2 - int16_t ex_max_def_levels[6] = {0, 1, 1, 2, 3, 3}; - int16_t ex_max_rep_levels[6] = {0, 0, 1, 1, 1, 2}; - - for (int i = 0; i < nleaves; ++i) { - const ColumnDescriptor* col = descr_.Column(i); - EXPECT_EQ(ex_max_def_levels[i], col->max_definition_level()) << i; - EXPECT_EQ(ex_max_rep_levels[i], col->max_repetition_level()) << i; - } - - ASSERT_EQ(descr_.Column(0)->path()->ToDotString(), "a"); - ASSERT_EQ(descr_.Column(1)->path()->ToDotString(), "b"); - ASSERT_EQ(descr_.Column(2)->path()->ToDotString(), "c"); - ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1"); - ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2"); - ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3"); - - 
ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0).get()); - ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3).get()); - ASSERT_EQ(bag.get(), descr_.GetColumnRoot(4).get()); - ASSERT_EQ(bag.get(), descr_.GetColumnRoot(5).get()); - - ASSERT_EQ(schema.get(), descr_.group_node()); - - // Init clears the leaves - descr_.Init(schema); - ASSERT_EQ(nleaves, descr_.num_columns()); -} - -} // namespace schema - -} // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-printer-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/schema-printer-test.cc b/src/parquet/schema/schema-printer-test.cc deleted file mode 100644 index 29140f0..0000000 --- a/src/parquet/schema/schema-printer-test.cc +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include <gtest/gtest.h> - -#include <iosfwd> -#include <string> -#include <vector> - -#include "parquet/schema/printer.h" -#include "parquet/schema/types.h" -#include "parquet/types.h" - -using std::string; -using std::vector; - -namespace parquet { - -namespace schema { - -static std::string Print(const NodePtr& node) { - std::stringstream ss; - PrintSchema(node.get(), ss); - return ss.str(); -} - -TEST(TestSchemaPrinter, Examples) { - // Test schema 1 - NodeVector fields; - fields.push_back(Int32("a", Repetition::REQUIRED)); - - // 3-level list encoding - NodePtr item1 = Int64("item1"); - NodePtr item2 = Boolean("item2", Repetition::REQUIRED); - NodePtr list( - GroupNode::Make("b", Repetition::REPEATED, {item1, item2}, LogicalType::LIST)); - NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); - fields.push_back(bag); - - fields.push_back(PrimitiveNode::Make( - "c", Repetition::REQUIRED, Type::INT32, LogicalType::DECIMAL, -1, 3, 2)); - - NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, fields); - - std::string result = Print(schema); - std::string expected = R"(message schema { - required int32 a; - optional group bag { - repeated group b (LIST) { - optional int64 item1; - required boolean item2; - } - } - required int32 c (DECIMAL(3,2)); -} -)"; - ASSERT_EQ(expected, result); -} - -} // namespace schema - -} // namespace parquet
