Repository: parquet-cpp Updated Branches: refs/heads/master aabb3db2c -> c0fd08a97
PARQUET-573: Create a public API for reading and writing file metadata This patch adds an API to read and write file metadata and improves the writer properties class. I am planning to add some comments to the code next. Meanwhile, any feedback will be helpful. Author: Deepak Majeti <[email protected]> Author: Uwe L. Korn <[email protected]> Closes #143 from majetideepak/metadata and squashes the following commits: 2b8a546 [Deepak Majeti] comments and more testing for metadata api 59147c0 [Deepak Majeti] fix memory leak 34e8975 [Deepak Majeti] review comments and format a977c6a [Deepak Majeti] added comment for file path d4f0e82 [Deepak Majeti] friendship between reader and writer. implements PARQUET-692 1047507 [Uwe L. Korn] Better dictionary encoding user experience 7f37f85 [Deepak Majeti] review edits 9dab591 [Deepak Majeti] minor rename a6b0646 [Deepak Majeti] added more dictionary fallback and enabled options to writer properties 3b9bad3 [Deepak Majeti] Metadata Reader writer Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c0fd08a9 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c0fd08a9 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c0fd08a9 Branch: refs/heads/master Commit: c0fd08a97c6817180372c90693b0b356cbce1f11 Parents: aabb3db Author: Deepak Majeti <[email protected]> Authored: Thu Sep 1 08:42:36 2016 -0400 Committer: Wes McKinney <[email protected]> Committed: Thu Sep 1 08:42:36 2016 -0400 ---------------------------------------------------------------------- CMakeLists.txt | 1 + example/parquet-dump-schema.cc | 2 +- src/parquet/CMakeLists.txt | 4 +- src/parquet/api/reader.h | 3 + src/parquet/column/properties.h | 173 +++++++-- src/parquet/column/writer.cc | 8 +- src/parquet/file/CMakeLists.txt | 2 + src/parquet/file/file-metadata-test.cc | 157 ++++++++ src/parquet/file/file-serialize-test.cc | 10 +- src/parquet/file/metadata.cc | 549
+++++++++++++++++++++++++++ src/parquet/file/metadata.h | 203 ++++++++++ src/parquet/file/reader-internal.cc | 88 +---- src/parquet/file/reader-internal.h | 25 +- src/parquet/file/reader.cc | 114 +++--- src/parquet/file/reader.h | 57 +-- src/parquet/reader-test.cc | 6 +- src/parquet/util/bpacking.h | 2 + 17 files changed, 1141 insertions(+), 263 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index f833f2c..5c26e79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -498,6 +498,7 @@ set(LIBPARQUET_SRCS src/parquet/compression/snappy-codec.cc src/parquet/compression/gzip-codec.cc + src/parquet/file/metadata.cc src/parquet/file/reader.cc src/parquet/file/reader-internal.cc src/parquet/file/writer.cc http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/example/parquet-dump-schema.cc ---------------------------------------------------------------------- diff --git a/example/parquet-dump-schema.cc b/example/parquet-dump-schema.cc index 760359e..ed7b570 100644 --- a/example/parquet-dump-schema.cc +++ b/example/parquet-dump-schema.cc @@ -27,7 +27,7 @@ int main(int argc, char** argv) { try { std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename); - PrintSchema(reader->descr()->schema().get(), std::cout); + PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout); } catch (const std::exception& e) { std::cerr << "Parquet error: " << e.what() http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/parquet/CMakeLists.txt b/src/parquet/CMakeLists.txt index 7d4e905..a2ebbad 100644 --- a/src/parquet/CMakeLists.txt +++ b/src/parquet/CMakeLists.txt @@ -21,8 +21,6 @@ 
install(FILES types.h DESTINATION include/parquet) -ADD_PARQUET_TEST(public-api-test - LINKAGE shared) - +ADD_PARQUET_TEST(public-api-test) ADD_PARQUET_TEST(types-test) ADD_PARQUET_TEST(reader-test) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/api/reader.h ---------------------------------------------------------------------- diff --git a/src/parquet/api/reader.h b/src/parquet/api/reader.h index 572ecf5..1e0c5e3 100644 --- a/src/parquet/api/reader.h +++ b/src/parquet/api/reader.h @@ -23,6 +23,9 @@ #include "parquet/exception.h" #include "parquet/file/reader.h" +// Metadata reader API +#include "parquet/file/metadata.h" + // Schemas #include "parquet/api/schema.h" http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/column/properties.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/properties.h b/src/parquet/column/properties.h index 3234dbc..c8f103b 100644 --- a/src/parquet/column/properties.h +++ b/src/parquet/column/properties.h @@ -22,6 +22,7 @@ #include <string> #include <unordered_map> +#include "parquet/exception.h" #include "parquet/types.h" #include "parquet/schema/types.h" #include "parquet/util/input.h" @@ -77,11 +78,13 @@ class PARQUET_EXPORT ReaderProperties { ReaderProperties PARQUET_EXPORT default_reader_properties(); -static int64_t DEFAULT_PAGE_SIZE = 1024 * 1024; -static int64_t DEFAULT_DICTIONARY_PAGE_SIZE = DEFAULT_PAGE_SIZE; -static Encoding::type DEFAULT_ENCODING = Encoding::PLAIN; +static constexpr int64_t DEFAULT_PAGE_SIZE = 1024 * 1024; +static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true; +static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE = DEFAULT_PAGE_SIZE; +static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN; static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION = ParquetVersion::PARQUET_1_0; +static std::string DEFAULT_CREATED_BY = "Apache parquet-cpp"; static constexpr Compression::type 
DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; using ColumnCodecs = std::unordered_map<std::string, Compression::type>; @@ -92,10 +95,12 @@ class PARQUET_EXPORT WriterProperties { public: Builder() : allocator_(default_allocator()), + dictionary_enabled_default_(DEFAULT_IS_DICTIONARY_ENABLED), dictionary_pagesize_(DEFAULT_DICTIONARY_PAGE_SIZE), - default_encoding_(DEFAULT_ENCODING), pagesize_(DEFAULT_PAGE_SIZE), version_(DEFAULT_WRITER_VERSION), + created_by_(DEFAULT_CREATED_BY), + default_encoding_(DEFAULT_ENCODING), default_codec_(DEFAULT_COMPRESSION_TYPE) {} virtual ~Builder() {} @@ -104,6 +109,34 @@ class PARQUET_EXPORT WriterProperties { return this; } + Builder* enable_dictionary() { + dictionary_enabled_default_ = true; + return this; + } + + Builder* disable_dictionary() { + dictionary_enabled_default_ = false; + return this; + } + + Builder* enable_dictionary(const std::string& path) { + dictionary_enabled_[path] = true; + return this; + } + + Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) { + return this->enable_dictionary(path->ToDotString()); + } + + Builder* disable_dictionary(const std::string& path) { + dictionary_enabled_[path] = false; + return this; + } + + Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) { + return this->disable_dictionary(path->ToDotString()); + } + Builder* dictionary_pagesize(int64_t dictionary_psize) { dictionary_pagesize_ = dictionary_psize; return this; @@ -114,26 +147,57 @@ class PARQUET_EXPORT WriterProperties { return this; } - Builder* encoding( - const std::shared_ptr<schema::ColumnPath>& path, Encoding::type encoding_type) { - return encoding(path->ToDotString(), encoding_type); + Builder* version(ParquetVersion::type version) { + version_ = version; + return this; } - Builder* encoding(const std::string& column_path, Encoding::type encoding_type) { - encodings_[column_path] = encoding_type; + Builder* created_by(const std::string& created_by) { +
created_by_ = created_by; return this; } + /** + * Define the encoding that is used when we don't utilise dictionary encoding. + * + * This either apply if dictionary encoding is disabled or if we fallback + * as the dictionary grew too large. + */ Builder* encoding(Encoding::type encoding_type) { + if (encoding_type == Encoding::PLAIN_DICTIONARY || + encoding_type == Encoding::RLE_DICTIONARY) { + throw ParquetException("Can't use dictionary encoding as fallback encoding"); + } default_encoding_ = encoding_type; return this; } - Builder* version(ParquetVersion::type version) { - version_ = version; + /** + * Define the encoding that is used when we don't utilise dictionary encoding. + * + * This either apply if dictionary encoding is disabled or if we fallback + * as the dictionary grew too large. + */ + Builder* encoding(const std::string& path, Encoding::type encoding_type) { + if (encoding_type == Encoding::PLAIN_DICTIONARY || + encoding_type == Encoding::RLE_DICTIONARY) { + throw ParquetException("Can't use dictionary encoding as fallback encoding"); + } + encodings_[path] = encoding_type; return this; } + /** + * Define the encoding that is used when we don't utilise dictionary encoding. + * + * This either apply if dictionary encoding is disabled or if we fallback + * as the dictionary grew too large. 
+ */ + Builder* encoding( + const std::shared_ptr<schema::ColumnPath>& path, Encoding::type encoding_type) { + return this->encoding(path->ToDotString(), encoding_type); + } + Builder* compression(Compression::type codec) { default_codec_ = codec; return this; @@ -151,76 +215,101 @@ class PARQUET_EXPORT WriterProperties { std::shared_ptr<WriterProperties> build() { return std::shared_ptr<WriterProperties>( - new WriterProperties(allocator_, dictionary_pagesize_, default_encoding_, - encodings_, pagesize_, version_, default_codec_, codecs_)); + new WriterProperties(allocator_, dictionary_enabled_default_, + dictionary_enabled_, dictionary_pagesize_, pagesize_, version_, created_by_, + default_encoding_, encodings_, default_codec_, codecs_)); } private: MemoryAllocator* allocator_; + bool dictionary_enabled_default_; + std::unordered_map<std::string, bool> dictionary_enabled_; int64_t dictionary_pagesize_; + int64_t pagesize_; + ParquetVersion::type version_; + std::string created_by_; // Encoding used for each column if not a specialized one is defined as // part of encodings_ Encoding::type default_encoding_; std::unordered_map<std::string, Encoding::type> encodings_; - int64_t pagesize_; - ParquetVersion::type version_; // Default compression codec. 
This will be used for all columns that do // not have a specific codec set as part of codecs_ Compression::type default_codec_; ColumnCodecs codecs_; }; - MemoryAllocator* allocator() const { return allocator_; } + inline MemoryAllocator* allocator() const { return allocator_; } - int64_t dictionary_pagesize() const { return dictionary_pagesize_; } + inline bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const { + auto it = dictionary_enabled_.find(path->ToDotString()); + if (it != dictionary_enabled_.end()) { return it->second; } + return dictionary_enabled_default_; + } + + inline int64_t dictionary_pagesize() const { return dictionary_pagesize_; } - int64_t data_pagesize() const { return pagesize_; } + inline int64_t data_pagesize() const { return pagesize_; } - ParquetVersion::type version() const { return parquet_version_; } + inline ParquetVersion::type version() const { return parquet_version_; } - Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const { - Encoding::type coding = default_encoding_; + inline std::string created_by() const { return parquet_created_by_; } + + inline Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const { auto it = encodings_.find(path->ToDotString()); - if (it != encodings_.end()) { coding = it->second; } - - // Use the correct enum value for dictionary coding based on the used Parquet version - if (coding == Encoding::PLAIN_DICTIONARY || coding == Encoding::RLE_DICTIONARY) { - if (parquet_version_ == ParquetVersion::PARQUET_1_0) { - return Encoding::PLAIN_DICTIONARY; - } else { - return Encoding::RLE_DICTIONARY; - } + if (it != encodings_.end()) { return it->second; } + return default_encoding_; + } + + inline Encoding::type dictionary_index_encoding() const { + if (parquet_version_ == ParquetVersion::PARQUET_1_0) { + return Encoding::PLAIN_DICTIONARY; + } else { + return Encoding::RLE_DICTIONARY; } - return coding; } - Compression::type 
compression(const std::shared_ptr<schema::ColumnPath>& path) const { + inline Encoding::type dictionary_page_encoding() const { + if (parquet_version_ == ParquetVersion::PARQUET_1_0) { + return Encoding::PLAIN_DICTIONARY; + } else { + return Encoding::PLAIN; + } + } + + inline Compression::type compression( + const std::shared_ptr<schema::ColumnPath>& path) const { auto it = codecs_.find(path->ToDotString()); if (it != codecs_.end()) return it->second; return default_codec_; } private: - explicit WriterProperties(MemoryAllocator* allocator, int64_t dictionary_pagesize, - Encoding::type default_encoding, - const std::unordered_map<std::string, Encoding::type>& encodings, int64_t pagesize, - ParquetVersion::type version, Compression::type default_codec, - const ColumnCodecs& codecs) + explicit WriterProperties(MemoryAllocator* allocator, bool dictionary_enabled_default, + std::unordered_map<std::string, bool> dictionary_enabled, + int64_t dictionary_pagesize, int64_t pagesize, ParquetVersion::type version, + const std::string& created_by, Encoding::type default_encoding, + std::unordered_map<std::string, Encoding::type> encodings, + Compression::type default_codec, const ColumnCodecs& codecs) : allocator_(allocator), + dictionary_enabled_default_(dictionary_enabled_default), + dictionary_enabled_(dictionary_enabled), dictionary_pagesize_(dictionary_pagesize), - default_encoding_(default_encoding), - encodings_(encodings), pagesize_(pagesize), parquet_version_(version), + parquet_created_by_(created_by), + default_encoding_(default_encoding), + encodings_(encodings), default_codec_(default_codec), codecs_(codecs) {} - MemoryAllocator* allocator_; + bool dictionary_enabled_default_; + std::unordered_map<std::string, bool> dictionary_enabled_; int64_t dictionary_pagesize_; - Encoding::type default_encoding_; - std::unordered_map<std::string, Encoding::type> encodings_; int64_t pagesize_; ParquetVersion::type parquet_version_; + std::string parquet_created_by_; + 
Encoding::type default_encoding_; + std::unordered_map<std::string, Encoding::type> encodings_; Compression::type default_codec_; ColumnCodecs codecs_; }; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/column/writer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/writer.cc b/src/parquet/column/writer.cc index 7845c58..124486c 100644 --- a/src/parquet/column/writer.cc +++ b/src/parquet/column/writer.cc @@ -182,9 +182,8 @@ void TypedColumnWriter<Type>::WriteDictionaryPage() { // TODO Get rid of this deep call dict_encoder->mem_pool()->FreeAll(); - Encoding::type dict_encoding = Encoding::PLAIN_DICTIONARY; - if (encoding_ == Encoding::RLE_DICTIONARY) { dict_encoding = Encoding::PLAIN; } - DictionaryPage page(buffer, dict_encoder->num_entries(), dict_encoding); + DictionaryPage page( + buffer, dict_encoder->num_entries(), properties_->dictionary_page_encoding()); total_bytes_written_ += pager_->WriteDictionaryPage(page); } @@ -195,6 +194,9 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(const ColumnDescriptor* descr, std::unique_ptr<PageWriter> pager, int64_t expected_rows, const WriterProperties* properties) { Encoding::type encoding = properties->encoding(descr->path()); + if (properties->dictionary_enabled(descr->path())) { + encoding = properties->dictionary_index_encoding(); + } switch (descr->physical_type()) { case Type::BOOLEAN: return std::make_shared<BoolWriter>( http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/parquet/file/CMakeLists.txt b/src/parquet/file/CMakeLists.txt index acfb513..fa995b8 100644 --- a/src/parquet/file/CMakeLists.txt +++ b/src/parquet/file/CMakeLists.txt @@ -16,9 +16,11 @@ # under the License.
install(FILES + metadata.h reader.h writer.h DESTINATION include/parquet/file) ADD_PARQUET_TEST(file-deserialize-test) +ADD_PARQUET_TEST(file-metadata-test) ADD_PARQUET_TEST(file-serialize-test) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/file-metadata-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/file-metadata-test.cc b/src/parquet/file/file-metadata-test.cc new file mode 100644 index 0000000..5fbd613 --- /dev/null +++ b/src/parquet/file/file-metadata-test.cc @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> +#include "parquet/file/metadata.h" +#include "parquet/schema/descriptor.h" +#include "parquet/schema/types.h" +#include "parquet/types.h" + +namespace parquet { + +namespace metadata { + +TEST(Metadata, TestBuildAccess) { + parquet::schema::NodeVector fields; + parquet::schema::NodePtr root; + parquet::SchemaDescriptor schema; + + std::shared_ptr<WriterProperties> props = WriterProperties::Builder().build(); + + fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); + fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); + root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); + schema.Init(root); + + int64_t nrows = 1000; + ColumnStatistics stats_int; + stats_int.null_count = 0; + stats_int.distinct_count = nrows; + std::string int_min = std::string("100"); + std::string int_max = std::string("200"); + stats_int.min = &int_min; + stats_int.max = &int_max; + ColumnStatistics stats_float; + stats_float.null_count = 0; + stats_float.distinct_count = nrows; + std::string float_min = std::string("100.100"); + std::string float_max = std::string("200.200"); + stats_float.min = &float_min; + stats_float.max = &float_max; + + auto f_builder = FileMetaDataBuilder::Make(&schema, props); + auto rg1_builder = f_builder->AppendRowGroup(); + auto rg2_builder = f_builder->AppendRowGroup(); + + // Write the metadata + // rowgroup1 metadata + auto col1_builder = rg1_builder->NextColumnChunk(); + auto col2_builder = rg1_builder->NextColumnChunk(); + // column metadata + col1_builder->SetStatistics(stats_int); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, false); + col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, false); + rg1_builder->Finish(nrows / 2); + + // rowgroup2 metadata + col1_builder = rg2_builder->NextColumnChunk(); + col2_builder = rg2_builder->NextColumnChunk(); + // column metadata + 
col1_builder->SetStatistics(stats_int); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, false); + col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, false); + rg2_builder->Finish(nrows / 2); + + // Read the metadata + auto f_accessor = f_builder->Finish(); + + // file metadata + ASSERT_EQ(nrows, f_accessor->num_rows()); + ASSERT_EQ(2, f_accessor->num_row_groups()); + ASSERT_EQ(DEFAULT_WRITER_VERSION, f_accessor->version()); + ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); + ASSERT_EQ(3, f_accessor->num_schema_elements()); + + // row group1 metadata + auto rg1_accessor = f_accessor->RowGroup(0); + ASSERT_EQ(2, rg1_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); + ASSERT_EQ(1024, rg1_accessor->total_byte_size()); + + auto rg1_column1 = rg1_accessor->ColumnChunk(0); + auto rg1_column2 = rg1_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg1_column1->is_stats_set()); + ASSERT_EQ(true, rg1_column2->is_stats_set()); + ASSERT_EQ("100.100", *rg1_column2->statistics().min); + ASSERT_EQ("200.200", *rg1_column2->statistics().max); + ASSERT_EQ("100", *rg1_column1->statistics().min); + ASSERT_EQ("200", *rg1_column1->statistics().max); + ASSERT_EQ(0, rg1_column1->statistics().null_count); + ASSERT_EQ(0, rg1_column2->statistics().null_count); + ASSERT_EQ(nrows, rg1_column1->statistics().distinct_count); + ASSERT_EQ(nrows, rg1_column2->statistics().distinct_count); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); + ASSERT_EQ(nrows / 2, rg1_column1->num_values()); + ASSERT_EQ(nrows / 2, rg1_column2->num_values()); + ASSERT_EQ(2, rg1_column1->encodings().size()); + ASSERT_EQ(2, rg1_column2->encodings().size()); + ASSERT_EQ(512, rg1_column1->total_compressed_size()); + ASSERT_EQ(512, rg1_column2->total_compressed_size()); + ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); + ASSERT_EQ(600, 
rg1_column2->total_uncompressed_size()); + ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); + ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); + ASSERT_EQ(10, rg1_column1->data_page_offset()); + ASSERT_EQ(30, rg1_column2->data_page_offset()); + + auto rg2_accessor = f_accessor->RowGroup(1); + ASSERT_EQ(2, rg2_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); + ASSERT_EQ(1024, rg2_accessor->total_byte_size()); + + auto rg2_column1 = rg2_accessor->ColumnChunk(0); + auto rg2_column2 = rg2_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg2_column1->is_stats_set()); + ASSERT_EQ(true, rg2_column2->is_stats_set()); + ASSERT_EQ("100.100", *rg2_column2->statistics().min); + ASSERT_EQ("200.200", *rg2_column2->statistics().max); + ASSERT_EQ("100", *rg2_column1->statistics().min); + ASSERT_EQ("200", *rg2_column1->statistics().max); + ASSERT_EQ(0, rg2_column1->statistics().null_count); + ASSERT_EQ(0, rg2_column2->statistics().null_count); + ASSERT_EQ(nrows, rg2_column1->statistics().distinct_count); + ASSERT_EQ(nrows, rg2_column2->statistics().distinct_count); + ASSERT_EQ(nrows / 2, rg2_column1->num_values()); + ASSERT_EQ(nrows / 2, rg2_column2->num_values()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); + ASSERT_EQ(2, rg2_column1->encodings().size()); + ASSERT_EQ(2, rg2_column2->encodings().size()); + ASSERT_EQ(512, rg2_column1->total_compressed_size()); + ASSERT_EQ(512, rg2_column2->total_compressed_size()); + ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); + ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); + ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); + ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); + ASSERT_EQ(10, rg2_column1->data_page_offset()); + ASSERT_EQ(26, rg2_column2->data_page_offset()); +} +} // namespace metadata +} // namespace parquet 
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/file-serialize-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/file-serialize-test.cc b/src/parquet/file/file-serialize-test.cc index ca7bb45..bd41e1e 100644 --- a/src/parquet/file/file-serialize-test.cc +++ b/src/parquet/file/file-serialize-test.cc @@ -79,13 +79,13 @@ class TestSerialize : public ::testing::Test { auto buffer = sink->GetBuffer(); std::unique_ptr<RandomAccessSource> source(new BufferReader(buffer)); auto file_reader = ParquetFileReader::Open(std::move(source)); - ASSERT_EQ(1, file_reader->num_columns()); - ASSERT_EQ(1, file_reader->num_row_groups()); - ASSERT_EQ(100, file_reader->num_rows()); + ASSERT_EQ(1, file_reader->metadata()->num_columns()); + ASSERT_EQ(1, file_reader->metadata()->num_row_groups()); + ASSERT_EQ(100, file_reader->metadata()->num_rows()); auto rg_reader = file_reader->RowGroup(0); - ASSERT_EQ(1, rg_reader->num_columns()); - ASSERT_EQ(100, rg_reader->num_rows()); + ASSERT_EQ(1, rg_reader->metadata()->num_columns()); + ASSERT_EQ(100, rg_reader->metadata()->num_rows()); auto col_reader = std::static_pointer_cast<Int64Reader>(rg_reader->Column(0)); std::vector<int64_t> values_out(100); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/metadata.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc new file mode 100644 index 0000000..c1fd767 --- /dev/null +++ b/src/parquet/file/metadata.cc @@ -0,0 +1,549 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <vector> + +#include "parquet/file/metadata.h" +#include "parquet/schema/converter.h" +#include "parquet/thrift/util.h" + +namespace parquet { + +// MetaData Accessor +// ColumnChunk metadata +class ColumnChunkMetaData::ColumnChunkMetaDataImpl { + public: + explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column) : column_(column) { + const format::ColumnMetaData& meta_data = column->meta_data; + for (auto encoding : meta_data.encodings) { + encodings_.push_back(FromThrift(encoding)); + } + if (meta_data.__isset.statistics) { + stats_.null_count = meta_data.statistics.null_count; + stats_.distinct_count = meta_data.statistics.distinct_count; + stats_.max = &meta_data.statistics.max; + stats_.min = &meta_data.statistics.min; + } + } + ~ColumnChunkMetaDataImpl() {} + + // column chunk + inline int64_t file_offset() const { return column_->file_offset; } + inline const std::string& file_path() const { return column_->file_path; } + + // column metadata + inline Type::type type() { return FromThrift(column_->meta_data.type); } + + inline int64_t num_values() const { return column_->meta_data.num_values; } + + std::shared_ptr<schema::ColumnPath> path_in_schema() { + return std::make_shared<schema::ColumnPath>(column_->meta_data.path_in_schema); + } + + inline bool is_stats_set() const { return column_->meta_data.__isset.statistics; } + + inline const ColumnStatistics& 
statistics() const { return stats_; } + + inline Compression::type compression() const { + return FromThrift(column_->meta_data.codec); + } + + const std::vector<Encoding::type>& encodings() const { return encodings_; } + + inline int64_t has_dictionary_page() const { + return column_->meta_data.__isset.dictionary_page_offset; + } + + inline int64_t dictionary_page_offset() const { + return column_->meta_data.dictionary_page_offset; + } + + inline int64_t data_page_offset() const { return column_->meta_data.data_page_offset; } + + inline int64_t index_page_offset() const { + return column_->meta_data.index_page_offset; + } + + inline int64_t total_compressed_size() const { + return column_->meta_data.total_compressed_size; + } + + inline int64_t total_uncompressed_size() const { + return column_->meta_data.total_uncompressed_size; + } + + private: + ColumnStatistics stats_; + std::vector<Encoding::type> encodings_; + const format::ColumnChunk* column_; +}; + +std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(const uint8_t* metadata) { + return std::unique_ptr<ColumnChunkMetaData>(new ColumnChunkMetaData(metadata)); +} + +ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata) + : impl_{std::unique_ptr<ColumnChunkMetaDataImpl>(new ColumnChunkMetaDataImpl( + reinterpret_cast<const format::ColumnChunk*>(metadata)))} {} +ColumnChunkMetaData::~ColumnChunkMetaData() {} + +// column chunk +int64_t ColumnChunkMetaData::file_offset() const { + return impl_->file_offset(); +} + +const std::string& ColumnChunkMetaData::file_path() const { + return impl_->file_path(); +} + +// column metadata +Type::type ColumnChunkMetaData::type() const { + return impl_->type(); +} + +int64_t ColumnChunkMetaData::num_values() const { + return impl_->num_values(); +} + +std::shared_ptr<schema::ColumnPath> ColumnChunkMetaData::path_in_schema() const { + return impl_->path_in_schema(); +} + +const ColumnStatistics& ColumnChunkMetaData::statistics() const { + return 
impl_->statistics(); +} + +bool ColumnChunkMetaData::is_stats_set() const { + return impl_->is_stats_set(); +} + +int64_t ColumnChunkMetaData::has_dictionary_page() const { + return impl_->has_dictionary_page(); +} + +int64_t ColumnChunkMetaData::dictionary_page_offset() const { + return impl_->dictionary_page_offset(); +} + +int64_t ColumnChunkMetaData::data_page_offset() const { + return impl_->data_page_offset(); +} + +int64_t ColumnChunkMetaData::index_page_offset() const { + return impl_->index_page_offset(); +} + +Compression::type ColumnChunkMetaData::compression() const { + return impl_->compression(); +} + +const std::vector<Encoding::type>& ColumnChunkMetaData::encodings() const { + return impl_->encodings(); +} + +int64_t ColumnChunkMetaData::total_uncompressed_size() const { + return impl_->total_uncompressed_size(); +} + +int64_t ColumnChunkMetaData::total_compressed_size() const { + return impl_->total_compressed_size(); +} + +// row-group metadata +class RowGroupMetaData::RowGroupMetaDataImpl { + public: + explicit RowGroupMetaDataImpl(const format::RowGroup* row_group) + : row_group_(row_group) {} + ~RowGroupMetaDataImpl() {} + + inline int num_columns() const { return row_group_->columns.size(); } + + inline int64_t num_rows() const { return row_group_->num_rows; } + + inline int64_t total_byte_size() const { return row_group_->total_byte_size; } + + std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) { + DCHECK(i < num_columns()) << "The file only has " << num_columns() + << " columns, requested metadata for column: " << i; + return ColumnChunkMetaData::Make( + reinterpret_cast<const uint8_t*>(&row_group_->columns[i])); + } + + private: + const format::RowGroup* row_group_; +}; + +std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(const uint8_t* metadata) { + return std::unique_ptr<RowGroupMetaData>(new RowGroupMetaData(metadata)); +} + +RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata) + : 
impl_{std::unique_ptr<RowGroupMetaDataImpl>(new RowGroupMetaDataImpl( + reinterpret_cast<const format::RowGroup*>(metadata)))} {} +RowGroupMetaData::~RowGroupMetaData() {} + +int RowGroupMetaData::num_columns() const { + return impl_->num_columns(); +} + +int64_t RowGroupMetaData::num_rows() const { + return impl_->num_rows(); +} + +int64_t RowGroupMetaData::total_byte_size() const { + return impl_->total_byte_size(); +} + +std::unique_ptr<ColumnChunkMetaData> RowGroupMetaData::ColumnChunk(int i) const { + return impl_->ColumnChunk(i); +} + +// file metadata +class FileMetaData::FileMetaDataImpl { + public: + FileMetaDataImpl() {} + + explicit FileMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) { + metadata_.reset(new format::FileMetaData); + DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); + InitSchema(); + } + ~FileMetaDataImpl() {} + + inline int num_columns() const { return schema_.num_columns(); } + inline int64_t num_rows() const { return metadata_->num_rows; } + inline int num_row_groups() const { return metadata_->row_groups.size(); } + inline int32_t version() const { return metadata_->version; } + inline const std::string& created_by() const { return metadata_->created_by; } + inline int num_schema_elements() const { return metadata_->schema.size(); } + + void WriteTo(OutputStream* dst) { SerializeThriftMsg(metadata_.get(), 1024, dst); } + + std::unique_ptr<RowGroupMetaData> RowGroup(int i) { + DCHECK(i < num_row_groups()) + << "The file only has " << num_row_groups() + << " row groups, requested metadata for row group: " << i; + return RowGroupMetaData::Make( + reinterpret_cast<const uint8_t*>(&metadata_->row_groups[i])); + } + + const SchemaDescriptor* schema_descriptor() const { return &schema_; } + + private: + friend FileMetaDataBuilder; + std::unique_ptr<format::FileMetaData> metadata_; + void InitSchema() { + schema::FlatSchemaConverter converter( + &metadata_->schema[0], metadata_->schema.size()); + 
schema_.Init(converter.Convert()); + } + SchemaDescriptor schema_; +}; + +std::unique_ptr<FileMetaData> FileMetaData::Make( + const uint8_t* metadata, uint32_t* metadata_len) { + return std::unique_ptr<FileMetaData>(new FileMetaData(metadata, metadata_len)); +} + +FileMetaData::FileMetaData(const uint8_t* metadata, uint32_t* metadata_len) + : impl_{std::unique_ptr<FileMetaDataImpl>( + new FileMetaDataImpl(metadata, metadata_len))} {} + +FileMetaData::FileMetaData() + : impl_{std::unique_ptr<FileMetaDataImpl>(new FileMetaDataImpl())} {} + +FileMetaData::~FileMetaData() {} + +std::unique_ptr<RowGroupMetaData> FileMetaData::RowGroup(int i) const { + return impl_->RowGroup(i); +} + +int FileMetaData::num_columns() const { + return impl_->num_columns(); +} + +int64_t FileMetaData::num_rows() const { + return impl_->num_rows(); +} + +int FileMetaData::num_row_groups() const { + return impl_->num_row_groups(); +} + +int32_t FileMetaData::version() const { + return impl_->version(); +} + +const std::string& FileMetaData::created_by() const { + return impl_->created_by(); +} + +int FileMetaData::num_schema_elements() const { + return impl_->num_schema_elements(); +} + +const SchemaDescriptor* FileMetaData::schema_descriptor() const { + return impl_->schema_descriptor(); +} + +void FileMetaData::WriteTo(OutputStream* dst) { + return impl_->WriteTo(dst); +} + +// MetaData Builders +// row-group metadata +class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { + public: + explicit ColumnChunkMetaDataBuilderImpl(const std::shared_ptr<WriterProperties>& props, + const ColumnDescriptor* column, uint8_t* contents) + : properties_(props), column_(column) { + column_chunk_ = reinterpret_cast<format::ColumnChunk*>(contents); + column_chunk_->meta_data.__set_type(ToThrift(column->physical_type())); + column_chunk_->meta_data.__set_path_in_schema(column->path()->ToDotVector()); + column_chunk_->meta_data.__set_codec( + ToThrift(properties_->compression(column->path()))); + } 
+ ~ColumnChunkMetaDataBuilderImpl() {} + + // column chunk + void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); } + + // column metadata + void SetStatistics(const ColumnStatistics& val) { + format::Statistics stats; + stats.null_count = val.null_count; + stats.distinct_count = val.distinct_count; + stats.max = *val.max; + stats.min = *val.min; + + column_chunk_->meta_data.statistics = stats; + column_chunk_->meta_data.__isset.statistics = true; + } + + void Finish(int64_t num_values, int64_t dictionary_page_offset, + int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, + int64_t uncompressed_size, bool dictionary_fallback = false) { + if (dictionary_page_offset > 0) { + column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); + } else { + column_chunk_->__set_file_offset(data_page_offset + compressed_size); + } + column_chunk_->__isset.meta_data = true; + column_chunk_->meta_data.__set_num_values(num_values); + column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); + column_chunk_->meta_data.__set_index_page_offset(index_page_offset); + column_chunk_->meta_data.__set_data_page_offset(data_page_offset); + column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size); + column_chunk_->meta_data.__set_total_compressed_size(compressed_size); + std::vector<format::Encoding::type> thrift_encodings; + thrift_encodings.push_back(ToThrift(Encoding::RLE)); + if (properties_->dictionary_enabled(column_->path())) { + thrift_encodings.push_back(ToThrift(properties_->dictionary_page_encoding())); + // add the encoding only if it is unique + if (properties_->version() == ParquetVersion::PARQUET_2_0) { + thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding())); + } + } + if (!properties_->dictionary_enabled(column_->path()) || dictionary_fallback) { + thrift_encodings.push_back(ToThrift(properties_->encoding(column_->path()))); + } + 
column_chunk_->meta_data.__set_encodings(thrift_encodings); + } + + private: + format::ColumnChunk* column_chunk_; + const std::shared_ptr<WriterProperties> properties_; + const ColumnDescriptor* column_; +}; + +std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make( + const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column, + uint8_t* contents) { + return std::unique_ptr<ColumnChunkMetaDataBuilder>( + new ColumnChunkMetaDataBuilder(props, column, contents)); +} + +ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( + const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column, + uint8_t* contents) + : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>( + new ColumnChunkMetaDataBuilderImpl(props, column, contents))} {} + +ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() {} + +void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) { + impl_->set_file_path(path); +} + +void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, + int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, + int64_t compressed_size, int64_t uncompressed_size, bool dictionary_fallback) { + impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset, + compressed_size, uncompressed_size, dictionary_fallback); +} + +void ColumnChunkMetaDataBuilder::SetStatistics(const ColumnStatistics& result) { + impl_->SetStatistics(result); +} + +class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { + public: + explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr<WriterProperties>& props, + const SchemaDescriptor* schema, uint8_t* contents) + : properties_(props), schema_(schema), current_column_(0) { + row_group_ = reinterpret_cast<format::RowGroup*>(contents); + InitializeColumns(schema->num_columns()); + } + ~RowGroupMetaDataBuilderImpl() {} + + ColumnChunkMetaDataBuilder* NextColumnChunk() { + DCHECK(current_column_ < num_columns()) + << 
"The schema only has " << num_columns() + << " columns, requested metadata for column: " << current_column_; + auto column = schema_->Column(current_column_); + auto column_builder = ColumnChunkMetaDataBuilder::Make(properties_, column, + reinterpret_cast<uint8_t*>(&row_group_->columns[current_column_++])); + auto column_builder_ptr = column_builder.get(); + column_builders_.push_back(std::move(column_builder)); + return column_builder_ptr; + } + + void Finish(int64_t num_rows) { + DCHECK(current_column_ == schema_->num_columns()) + << "Only " << current_column_ - 1 << " out of " << schema_->num_columns() + << " columns are initialized"; + size_t total_byte_size = 0; + + for (int i = 0; i < schema_->num_columns(); i++) { + DCHECK(row_group_->columns[i].file_offset > 0) << "Column " << i + << " is not complete."; + total_byte_size += row_group_->columns[i].meta_data.total_compressed_size; + } + + row_group_->__set_total_byte_size(total_byte_size); + row_group_->__set_num_rows(num_rows); + } + + private: + int num_columns() { return row_group_->columns.size(); } + + void InitializeColumns(int ncols) { row_group_->columns.resize(ncols); } + + format::RowGroup* row_group_; + const std::shared_ptr<WriterProperties> properties_; + const SchemaDescriptor* schema_; + std::vector<std::unique_ptr<ColumnChunkMetaDataBuilder>> column_builders_; + int current_column_; +}; + +std::unique_ptr<RowGroupMetaDataBuilder> RowGroupMetaDataBuilder::Make( + const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_, + uint8_t* contents) { + return std::unique_ptr<RowGroupMetaDataBuilder>( + new RowGroupMetaDataBuilder(props, schema_, contents)); +} + +RowGroupMetaDataBuilder::RowGroupMetaDataBuilder( + const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_, + uint8_t* contents) + : impl_{std::unique_ptr<RowGroupMetaDataBuilderImpl>( + new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {} + 
+RowGroupMetaDataBuilder::~RowGroupMetaDataBuilder() {} + +ColumnChunkMetaDataBuilder* RowGroupMetaDataBuilder::NextColumnChunk() { + return impl_->NextColumnChunk(); +} + +void RowGroupMetaDataBuilder::Finish(int64_t num_rows) { + impl_->Finish(num_rows); +} + +// file metadata +class FileMetaDataBuilder::FileMetaDataBuilderImpl { + public: + explicit FileMetaDataBuilderImpl( + const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props) + : properties_(props), schema_(schema) { + metadata_.reset(new format::FileMetaData()); + } + ~FileMetaDataBuilderImpl() {} + + RowGroupMetaDataBuilder* AppendRowGroup() { + auto row_group = std::unique_ptr<format::RowGroup>(new format::RowGroup()); + auto row_group_builder = RowGroupMetaDataBuilder::Make( + properties_, schema_, reinterpret_cast<uint8_t*>(row_group.get())); + RowGroupMetaDataBuilder* row_group_ptr = row_group_builder.get(); + row_group_builders_.push_back(std::move(row_group_builder)); + row_groups_.push_back(std::move(row_group)); + return row_group_ptr; + } + + std::unique_ptr<FileMetaData> Finish() { + int64_t total_rows = 0; + std::vector<format::RowGroup> row_groups; + for (auto row_group = row_groups_.begin(); row_group != row_groups_.end(); + row_group++) { + auto rowgroup = *((*row_group).get()); + row_groups.push_back(rowgroup); + total_rows += rowgroup.num_rows; + } + metadata_->__set_num_rows(total_rows); + metadata_->__set_row_groups(row_groups); + metadata_->__set_version(properties_->version()); + metadata_->__set_created_by(properties_->created_by()); + parquet::schema::SchemaFlattener flattener( + static_cast<parquet::schema::GroupNode*>(schema_->schema().get()), + &metadata_->schema); + flattener.Flatten(); + auto file_meta_data = std::unique_ptr<FileMetaData>(new FileMetaData()); + file_meta_data->impl_->metadata_ = std::move(metadata_); + file_meta_data->impl_->InitSchema(); + return file_meta_data; + } + + protected: + std::unique_ptr<format::FileMetaData> metadata_; + + 
private: + const std::shared_ptr<WriterProperties> properties_; + std::vector<std::unique_ptr<format::RowGroup>> row_groups_; + std::vector<std::unique_ptr<RowGroupMetaDataBuilder>> row_group_builders_; + const SchemaDescriptor* schema_; +}; + +std::unique_ptr<FileMetaDataBuilder> FileMetaDataBuilder::Make( + const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props) { + return std::unique_ptr<FileMetaDataBuilder>(new FileMetaDataBuilder(schema, props)); +} + +FileMetaDataBuilder::FileMetaDataBuilder( + const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props) + : impl_{std::unique_ptr<FileMetaDataBuilderImpl>( + new FileMetaDataBuilderImpl(schema, props))} {} + +FileMetaDataBuilder::~FileMetaDataBuilder() {} + +RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { + return impl_->AppendRowGroup(); +} + +std::unique_ptr<FileMetaData> FileMetaDataBuilder::Finish() { + return impl_->Finish(); +} + +} // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/metadata.h ---------------------------------------------------------------------- diff --git a/src/parquet/file/metadata.h b/src/parquet/file/metadata.h new file mode 100644 index 0000000..c35f82f --- /dev/null +++ b/src/parquet/file/metadata.h @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_FILE_METADATA_H +#define PARQUET_FILE_METADATA_H + +#include <string> +#include <vector> +#include <set> + +#include "parquet/column/properties.h" +#include "parquet/compression/codec.h" +#include "parquet/schema/descriptor.h" +#include "parquet/types.h" +#include "parquet/util/output.h" +#include "parquet/util/visibility.h" + +namespace parquet { + +// ColumnStatistics does not own the min/max values +struct ColumnStatistics { + int64_t null_count; + int64_t distinct_count; + const std::string* min; + const std::string* max; +}; + +class PARQUET_EXPORT ColumnChunkMetaData { + public: + // API convenience to get a MetaData accessor + static std::unique_ptr<ColumnChunkMetaData> Make(const uint8_t* metadata); + + ~ColumnChunkMetaData(); + + // column chunk + int64_t file_offset() const; + // parameter is only used when a dataset is spread across multiple files + const std::string& file_path() const; + // column metadata + Type::type type() const; + int64_t num_values() const; + std::shared_ptr<schema::ColumnPath> path_in_schema() const; + bool is_stats_set() const; + const ColumnStatistics& statistics() const; + Compression::type compression() const; + const std::vector<Encoding::type>& encodings() const; + int64_t has_dictionary_page() const; + int64_t dictionary_page_offset() const; + int64_t data_page_offset() const; + int64_t index_page_offset() const; + int64_t total_compressed_size() const; + int64_t total_uncompressed_size() const; + + private: + explicit ColumnChunkMetaData(const uint8_t* metadata); + // PIMPL Idiom + class 
ColumnChunkMetaDataImpl; + std::unique_ptr<ColumnChunkMetaDataImpl> impl_; +}; + +class PARQUET_EXPORT RowGroupMetaData { + public: + // API convenience to get a MetaData accessor + static std::unique_ptr<RowGroupMetaData> Make(const uint8_t* metadata); + + ~RowGroupMetaData(); + + // row-group metadata + int num_columns() const; + int64_t num_rows() const; + int64_t total_byte_size() const; + std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) const; + + private: + explicit RowGroupMetaData(const uint8_t* metadata); + // PIMPL Idiom + class RowGroupMetaDataImpl; + std::unique_ptr<RowGroupMetaDataImpl> impl_; +}; + +class FileMetaDataBuilder; + +class PARQUET_EXPORT FileMetaData { + public: + // API convenience to get a MetaData accessor + static std::unique_ptr<FileMetaData> Make( + const uint8_t* serialized_metadata, uint32_t* metadata_len); + + ~FileMetaData(); + + // file metadata + int num_columns() const; + int64_t num_rows() const; + int num_row_groups() const; + int32_t version() const; + const std::string& created_by() const; + int num_schema_elements() const; + std::unique_ptr<RowGroupMetaData> RowGroup(int i) const; + + void WriteTo(OutputStream* dst); + + // Return const-pointer to make it clear that this object is not to be copied + const SchemaDescriptor* schema_descriptor() const; + + private: + friend FileMetaDataBuilder; + explicit FileMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len); + // PIMPL Idiom + FileMetaData(); + class FileMetaDataImpl; + std::unique_ptr<FileMetaDataImpl> impl_; +}; + +// Builder API +class PARQUET_EXPORT ColumnChunkMetaDataBuilder { + public: + // API convenience to get a MetaData reader + static std::unique_ptr<ColumnChunkMetaDataBuilder> Make( + const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column, + uint8_t* contents); + + ~ColumnChunkMetaDataBuilder(); + + // column chunk + // Used when a dataset is spread across multiple files + void set_file_path(const std::string& 
path); + // column metadata + // ownership of min/max is with ColumnChunkMetadata + void SetStatistics(const ColumnStatistics& stats); + + // commit the metadata + void Finish(int64_t num_values, int64_t dictonary_page_offset, + int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, + int64_t uncompressed_size, bool dictionary_fallback); + + private: + explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props, + const ColumnDescriptor* column, uint8_t* contents); + // PIMPL Idiom + class ColumnChunkMetaDataBuilderImpl; + std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_; +}; + +class PARQUET_EXPORT RowGroupMetaDataBuilder { + public: + // API convenience to get a MetaData reader + static std::unique_ptr<RowGroupMetaDataBuilder> Make( + const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_, + uint8_t* contents); + + ~RowGroupMetaDataBuilder(); + + ColumnChunkMetaDataBuilder* NextColumnChunk(); + + // commit the metadata + void Finish(int64_t num_rows); + + private: + explicit RowGroupMetaDataBuilder(const std::shared_ptr<WriterProperties>& props, + const SchemaDescriptor* schema_, uint8_t* contents); + // PIMPL Idiom + class RowGroupMetaDataBuilderImpl; + std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_; +}; + +class PARQUET_EXPORT FileMetaDataBuilder { + public: + // API convenience to get a MetaData reader + static std::unique_ptr<FileMetaDataBuilder> Make( + const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props); + + ~FileMetaDataBuilder(); + + RowGroupMetaDataBuilder* AppendRowGroup(); + + // commit the metadata + std::unique_ptr<FileMetaData> Finish(); + + private: + explicit FileMetaDataBuilder( + const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props); + // PIMPL Idiom + class FileMetaDataBuilderImpl; + std::unique_ptr<FileMetaDataBuilderImpl> impl_; +}; + +} // namespace parquet + +#endif // PARQUET_FILE_METADATA_H 
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/reader-internal.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc index 9e592b5..5c1bc37 100644 --- a/src/parquet/file/reader-internal.cc +++ b/src/parquet/file/reader-internal.cc @@ -141,76 +141,26 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() { return std::shared_ptr<Page>(nullptr); } -// ---------------------------------------------------------------------- -// SerializedRowGroup - -int64_t SerializedRowGroup::num_rows() const { - return metadata_->num_rows; -} - -int SerializedRowGroup::num_columns() const { - return metadata_->columns.size(); +const RowGroupMetaData* SerializedRowGroup::metadata() const { + return row_group_metadata_.get(); } std::unique_ptr<PageReader> SerializedRowGroup::GetColumnPageReader(int i) { // Read column chunk from the file - const format::ColumnChunk& col = metadata_->columns[i]; + auto col = row_group_metadata_->ColumnChunk(i); - int64_t col_start = col.meta_data.data_page_offset; - if (col.meta_data.__isset.dictionary_page_offset && - col_start > col.meta_data.dictionary_page_offset) { - col_start = col.meta_data.dictionary_page_offset; + int64_t col_start = col->data_page_offset(); + if (col->has_dictionary_page() && col_start > col->dictionary_page_offset()) { + col_start = col->dictionary_page_offset(); } - int64_t bytes_to_read = col.meta_data.total_compressed_size; + int64_t bytes_to_read = col->total_compressed_size(); std::unique_ptr<InputStream> stream; stream = properties_.GetStream(source_, col_start, bytes_to_read); return std::unique_ptr<PageReader>(new SerializedPageReader( - std::move(stream), FromThrift(col.meta_data.codec), properties_.allocator())); -} - -RowGroupStatistics SerializedRowGroup::GetColumnStats(int i) const { - const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data; - - 
RowGroupStatistics result; - result.num_values = meta_data.num_values; - result.null_count = meta_data.statistics.null_count; - result.distinct_count = meta_data.statistics.distinct_count; - result.max = &meta_data.statistics.max; - result.min = &meta_data.statistics.min; - return result; -} - -bool SerializedRowGroup::IsColumnStatsSet(int i) const { - const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data; - return meta_data.__isset.statistics; -} - -Compression::type SerializedRowGroup::GetColumnCompression(int i) const { - const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data; - return FromThrift(meta_data.codec); -} - -std::vector<Encoding::type> SerializedRowGroup::GetColumnEncodings(int i) const { - const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data; - - std::vector<Encoding::type> encodings; - for (auto encoding : meta_data.encodings) { - encodings.push_back(FromThrift(encoding)); - } - return encodings; -} - -int64_t SerializedRowGroup::GetColumnUnCompressedSize(int i) const { - const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data; - return meta_data.total_uncompressed_size; -} - -int64_t SerializedRowGroup::GetColumnCompressedSize(int i) const { - const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data; - return meta_data.total_compressed_size; + std::move(stream), col->compression(), properties_.allocator())); } // ---------------------------------------------------------------------- @@ -242,23 +192,15 @@ SerializedFile::~SerializedFile() { } std::shared_ptr<RowGroupReader> SerializedFile::GetRowGroup(int i) { - std::unique_ptr<SerializedRowGroup> contents( - new SerializedRowGroup(source_.get(), &metadata_.row_groups[i], properties_)); + std::unique_ptr<SerializedRowGroup> contents(new SerializedRowGroup( + source_.get(), std::move(file_metadata_->RowGroup(i)), properties_)); return std::make_shared<RowGroupReader>( - &schema_, std::move(contents), 
properties_.allocator()); -} - -int64_t SerializedFile::num_rows() const { - return metadata_.num_rows; -} - -int SerializedFile::num_columns() const { - return schema_.num_columns(); + file_metadata_->schema_descriptor(), std::move(contents), properties_.allocator()); } -int SerializedFile::num_row_groups() const { - return metadata_.row_groups.size(); +const FileMetaData* SerializedFile::metadata() const { + return file_metadata_.get(); } SerializedFile::SerializedFile(std::unique_ptr<RandomAccessSource> source, @@ -293,10 +235,8 @@ void SerializedFile::ParseMetaData() { if (bytes_read != metadata_len) { throw ParquetException("Invalid parquet file. Could not read metadata bytes."); } - DeserializeThriftMsg(&metadata_buffer[0], &metadata_len, &metadata_); - schema::FlatSchemaConverter converter(&metadata_.schema[0], metadata_.schema.size()); - schema_.Init(converter.Convert()); + file_metadata_ = FileMetaData::Make(&metadata_buffer[0], &metadata_len); } } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/reader-internal.h ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader-internal.h b/src/parquet/file/reader-internal.h index 0c3c9f7..48e2daf 100644 --- a/src/parquet/file/reader-internal.h +++ b/src/parquet/file/reader-internal.h @@ -25,6 +25,7 @@ #include "parquet/column/page.h" #include "parquet/column/properties.h" #include "parquet/compression/codec.h" +#include "parquet/file/metadata.h" #include "parquet/file/reader.h" #include "parquet/thrift/parquet_types.h" #include "parquet/types.h" @@ -69,23 +70,17 @@ class SerializedPageReader : public PageReader { // RowGroupReader::Contents implementation for the Parquet file specification class SerializedRowGroup : public RowGroupReader::Contents { public: - SerializedRowGroup(RandomAccessSource* source, const format::RowGroup* metadata, - ReaderProperties props) - : source_(source), metadata_(metadata), 
properties_(props) {} + SerializedRowGroup(RandomAccessSource* source, + std::unique_ptr<RowGroupMetaData> metadata, const ReaderProperties props) + : source_(source), row_group_metadata_(std::move(metadata)), properties_(props) {} + + virtual const RowGroupMetaData* metadata() const; - virtual int num_columns() const; - virtual int64_t num_rows() const; virtual std::unique_ptr<PageReader> GetColumnPageReader(int i); - virtual RowGroupStatistics GetColumnStats(int i) const; - virtual bool IsColumnStatsSet(int i) const; - virtual Compression::type GetColumnCompression(int i) const; - virtual std::vector<Encoding::type> GetColumnEncodings(int i) const; - virtual int64_t GetColumnCompressedSize(int i) const; - virtual int64_t GetColumnUnCompressedSize(int i) const; private: RandomAccessSource* source_; - const format::RowGroup* metadata_; + std::unique_ptr<RowGroupMetaData> row_group_metadata_; ReaderProperties properties_; }; @@ -103,9 +98,7 @@ class SerializedFile : public ParquetFileReader::Contents { ReaderProperties props = default_reader_properties()); virtual void Close(); virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i); - virtual int64_t num_rows() const; - virtual int num_columns() const; - virtual int num_row_groups() const; + virtual const FileMetaData* metadata() const; virtual ~SerializedFile(); private: @@ -114,7 +107,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::unique_ptr<RandomAccessSource> source, ReaderProperties props); std::unique_ptr<RandomAccessSource> source_; - format::FileMetaData metadata_; + std::unique_ptr<FileMetaData> file_metadata_; ReaderProperties properties_; void ParseMetaData(); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/reader.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc index aabcc2b..b6de168 100644 --- a/src/parquet/file/reader.cc +++ 
b/src/parquet/file/reader.cc @@ -30,6 +30,7 @@ #include "parquet/exception.h" #include "parquet/file/reader-internal.h" #include "parquet/util/input.h" +#include "parquet/util/logging.h" #include "parquet/types.h" using std::string; @@ -44,50 +45,24 @@ RowGroupReader::RowGroupReader(const SchemaDescriptor* schema, std::unique_ptr<Contents> contents, MemoryAllocator* allocator) : schema_(schema), contents_(std::move(contents)), allocator_(allocator) {} -int RowGroupReader::num_columns() const { - return contents_->num_columns(); -} - -int64_t RowGroupReader::num_rows() const { - return contents_->num_rows(); -} - std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) { - // TODO: boundschecking + DCHECK(i < schema_->num_columns()) << "The RowGroup only has " << schema_->num_columns() + << "columns, requested column: " << i; const ColumnDescriptor* descr = schema_->Column(i); std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i); return ColumnReader::Make(descr, std::move(page_reader), allocator_); } -RowGroupStatistics RowGroupReader::GetColumnStats(int i) const { - return contents_->GetColumnStats(i); -} - -bool RowGroupReader::IsColumnStatsSet(int i) const { - return contents_->IsColumnStatsSet(i); -} - -Compression::type RowGroupReader::GetColumnCompression(int i) const { - return contents_->GetColumnCompression(i); -} - -std::vector<Encoding::type> RowGroupReader::GetColumnEncodings(int i) const { - return contents_->GetColumnEncodings(i); -} - -int64_t RowGroupReader::GetColumnUnCompressedSize(int i) const { - return contents_->GetColumnUnCompressedSize(i); -} - -int64_t RowGroupReader::GetColumnCompressedSize(int i) const { - return contents_->GetColumnCompressedSize(i); +// Returns the rowgroup metadata +const RowGroupMetaData* RowGroupReader::metadata() const { + return contents_->metadata(); } // ---------------------------------------------------------------------- // ParquetFileReader public API 
-ParquetFileReader::ParquetFileReader() : schema_(nullptr) {} +ParquetFileReader::ParquetFileReader() {} ParquetFileReader::~ParquetFileReader() { Close(); } @@ -117,33 +92,20 @@ std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile( void ParquetFileReader::Open(std::unique_ptr<ParquetFileReader::Contents> contents) { contents_ = std::move(contents); - schema_ = contents_->schema(); } void ParquetFileReader::Close() { if (contents_) { contents_->Close(); } } -int ParquetFileReader::num_row_groups() const { - return contents_->num_row_groups(); -} - -int64_t ParquetFileReader::num_rows() const { - return contents_->num_rows(); -} - -int ParquetFileReader::num_columns() const { - return schema_->num_columns(); +const FileMetaData* ParquetFileReader::metadata() const { + return contents_->metadata(); } std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) { - if (i >= num_row_groups()) { - std::stringstream ss; - ss << "The file only has " << num_row_groups() - << "row groups, requested reader for: " << i; - throw ParquetException(ss.str()); - } - + DCHECK(i < metadata()->num_row_groups()) << "The file only has " + << metadata()->num_row_groups() + << "row groups, requested reader for: " << i; return contents_->GetRowGroup(i); } @@ -155,43 +117,57 @@ std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) { void ParquetFileReader::DebugPrint( std::ostream& stream, std::list<int> selected_columns, bool print_values) { + const FileMetaData* file_metadata = metadata(); + stream << "File statistics:\n"; - stream << "Total rows: " << num_rows() << "\n"; + stream << "Version: " << file_metadata->version() << "\n"; + stream << "Created By: " << file_metadata->created_by() << "\n"; + stream << "Total rows: " << file_metadata->num_rows() << "\n"; + stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n"; + stream << "Number of Real Columns: " + << file_metadata->schema_descriptor()->group()->field_count() << "\n"; if 
(selected_columns.size() == 0) { - for (int i = 0; i < num_columns(); i++) { + for (int i = 0; i < file_metadata->num_columns(); i++) { selected_columns.push_back(i); } } else { for (auto i : selected_columns) { - if (i < 0 || i >= num_columns()) { + if (i < 0 || i >= file_metadata->num_columns()) { throw ParquetException("Selected column is out of range"); } } } + stream << "Number of Columns: " << file_metadata->num_columns() << "\n"; + stream << "Number of Selected Columns: " << selected_columns.size() << "\n"; for (auto i : selected_columns) { - const ColumnDescriptor* descr = schema_->Column(i); + const ColumnDescriptor* descr = file_metadata->schema_descriptor()->Column(i); stream << "Column " << i << ": " << descr->name() << " (" << type_to_string(descr->physical_type()) << ")" << std::endl; } - for (int r = 0; r < num_row_groups(); ++r) { + for (int r = 0; r < file_metadata->num_row_groups(); ++r) { stream << "--- Row Group " << r << " ---\n"; auto group_reader = RowGroup(r); + std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); + + stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n"; + stream << " rows: " << group_metadata->num_rows() << "---\n"; // Print column metadata for (auto i : selected_columns) { - RowGroupStatistics stats = group_reader->GetColumnStats(i); + auto column_chunk = group_metadata->ColumnChunk(i); + const ColumnStatistics stats = column_chunk->statistics(); - const ColumnDescriptor* descr = schema_->Column(i); + const ColumnDescriptor* descr = file_metadata->schema_descriptor()->Column(i); stream << "Column " << i << std::endl - << " rows: " << group_reader->num_rows() << ", values: " << stats.num_values - << ", null values: " << stats.null_count - << ", distinct values: " << stats.distinct_count << std::endl; - if (group_reader->IsColumnStatsSet(i)) { - stream << " max: " << FormatStatValue(descr->physical_type(), stats.max->c_str()) + << ", values: " << column_chunk->num_values(); 
+ if (column_chunk->is_stats_set()) { + stream << ", null values: " << stats.null_count + << ", distinct values: " << stats.distinct_count << std::endl + << " max: " << FormatStatValue(descr->physical_type(), stats.max->c_str()) << ", min: " << FormatStatValue(descr->physical_type(), stats.min->c_str()); } else { @@ -199,15 +175,16 @@ void ParquetFileReader::DebugPrint( } stream << std::endl << " compression: " - << compression_to_string(group_reader->GetColumnCompression(i)) + << compression_to_string(column_chunk->compression()) << ", encodings: "; - for (auto encoding : group_reader->GetColumnEncodings(i)) { + for (auto encoding : column_chunk->encodings()) { stream << encoding_to_string(encoding) << " "; } stream << std::endl - << " uncompressed size: " << group_reader->GetColumnUnCompressedSize(i) - << ", compressed size: " << group_reader->GetColumnCompressedSize(i) - << std::endl; + << " uncompressed size: " + << column_chunk->total_uncompressed_size() + << ", compressed size: " + << column_chunk->total_compressed_size() << std::endl; } if (!print_values) { continue; } @@ -225,7 +202,8 @@ void ParquetFileReader::DebugPrint( ss << "%-" << COL_WIDTH << "s"; std::string fmt = ss.str(); - snprintf(buffer, bufsize, fmt.c_str(), column_schema(i)->name().c_str()); + snprintf(buffer, bufsize, fmt.c_str(), + file_metadata->schema_descriptor()->Column(i)->name().c_str()); stream << buffer; // This is OK in this method as long as the RowGroupReader does not get http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/reader.h ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader.h b/src/parquet/file/reader.h index 8e0d26f..baa3e30 100644 --- a/src/parquet/file/reader.h +++ b/src/parquet/file/reader.h @@ -27,6 +27,7 @@ #include "parquet/column/page.h" #include "parquet/column/properties.h" +#include "parquet/file/metadata.h" #include "parquet/schema/descriptor.h" #include 
"parquet/util/visibility.h" @@ -35,54 +36,31 @@ namespace parquet { class ColumnReader; class RandomAccessSource; -struct RowGroupStatistics { - int64_t num_values; - int64_t null_count; - int64_t distinct_count; - const std::string* min; - const std::string* max; -}; - class PARQUET_EXPORT RowGroupReader { public: // Forward declare the PIMPL struct Contents { - virtual int num_columns() const = 0; - virtual int64_t num_rows() const = 0; + virtual ~Contents() {} virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0; - virtual RowGroupStatistics GetColumnStats(int i) const = 0; - virtual bool IsColumnStatsSet(int i) const = 0; - virtual Compression::type GetColumnCompression(int i) const = 0; - virtual std::vector<Encoding::type> GetColumnEncodings(int i) const = 0; - virtual int64_t GetColumnCompressedSize(int i) const = 0; - virtual int64_t GetColumnUnCompressedSize(int i) const = 0; + virtual const RowGroupMetaData* metadata() const = 0; }; RowGroupReader(const SchemaDescriptor* schema, std::unique_ptr<Contents> contents, MemoryAllocator* allocator); + // Returns the rowgroup metadata + const RowGroupMetaData* metadata() const; + // Construct a ColumnReader for the indicated row group-relative // column. Ownership is shared with the RowGroupReader. std::shared_ptr<ColumnReader> Column(int i); - int num_columns() const; - int64_t num_rows() const; - - RowGroupStatistics GetColumnStats(int i) const; - bool IsColumnStatsSet(int i) const; - Compression::type GetColumnCompression(int i) const; - std::vector<Encoding::type> GetColumnEncodings(int i) const; - int64_t GetColumnCompressedSize(int i) const; - int64_t GetColumnUnCompressedSize(int i) const; private: - // Owned by the parent ParquetFileReader const SchemaDescriptor* schema_; - // PIMPL idiom // This is declared in the .cc file so that we can hide compiled Thrift // headers from the public API and also more easily create test fixtures. 
std::unique_ptr<Contents> contents_; - MemoryAllocator* allocator_; }; @@ -93,16 +71,8 @@ class PARQUET_EXPORT ParquetFileReader { virtual ~Contents() {} // Perform any cleanup associated with the file contents virtual void Close() = 0; - virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0; - - virtual int64_t num_rows() const = 0; - virtual int num_columns() const = 0; - virtual int num_row_groups() const = 0; - - // Return const-poitner to make it clear that this object is not to be copied - const SchemaDescriptor* schema() const { return &schema_; } - SchemaDescriptor schema_; + virtual const FileMetaData* metadata() const = 0; }; ParquetFileReader(); @@ -122,14 +92,8 @@ class PARQUET_EXPORT ParquetFileReader { // The RowGroupReader is owned by the FileReader std::shared_ptr<RowGroupReader> RowGroup(int i); - int num_columns() const; - int64_t num_rows() const; - int num_row_groups() const; - - // Returns the file schema descriptor - const SchemaDescriptor* descr() { return schema_; } - - const ColumnDescriptor* column_schema(int i) const { return schema_->Column(i); } + // Returns the file metadata + const FileMetaData* metadata() const; void DebugPrint( std::ostream& stream, std::list<int> selected_columns, bool print_values = true); @@ -139,9 +103,6 @@ class PARQUET_EXPORT ParquetFileReader { // This is declared in the .cc file so that we can hide compiled Thrift // headers from the public API and also more easily create test fixtures. 
std::unique_ptr<Contents> contents_; - - // The SchemaDescriptor is provided by the Contents impl - const SchemaDescriptor* schema_; }; } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/reader-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc index 9a8fcf6..2e28c80 100644 --- a/src/parquet/reader-test.cc +++ b/src/parquet/reader-test.cc @@ -68,11 +68,11 @@ TEST_F(TestAllTypesPlain, TestBatchRead) { int32_t values[4]; // This file only has 8 rows - ASSERT_EQ(8, reader_->num_rows()); + ASSERT_EQ(8, reader_->metadata()->num_rows()); // This file only has 1 row group - ASSERT_EQ(1, reader_->num_row_groups()); + ASSERT_EQ(1, reader_->metadata()->num_row_groups()); // This row group must have 8 rows - ASSERT_EQ(8, group->num_rows()); + ASSERT_EQ(8, group->metadata()->num_rows()); ASSERT_TRUE(col->HasNext()); int64_t values_read; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/util/bpacking.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/bpacking.h b/src/parquet/util/bpacking.h index d9ae531..f407538 100644 --- a/src/parquet/util/bpacking.h +++ b/src/parquet/util/bpacking.h @@ -13,6 +13,8 @@ #ifndef PARQUET_UTIL_BPACKING_H #define PARQUET_UTIL_BPACKING_H +#include <stdexcept> + namespace parquet { inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {
