Repository: parquet-cpp Updated Branches: refs/heads/master 5e59bc5c6 -> 1a6d22b7d
PARQUET-793: Do not return incorrect statistics Author: Deepak Majeti <[email protected]> Closes #239 from majetideepak/FixStats and squashes the following commits: 948c9b9 [Deepak Majeti] fix failing test ea0693f [Deepak Majeti] review comments dbb8569 [Deepak Majeti] minor fix 75d392a [Deepak Majeti] format e89f85f [Deepak Majeti] stats test and comments 800e239 [Deepak Majeti] added tests 92f31cd [Deepak Majeti] Return only Signed stats feeeb88 [Deepak Majeti] Update Version and add checkCorrectStatistics Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/1a6d22b7 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/1a6d22b7 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/1a6d22b7 Branch: refs/heads/master Commit: 1a6d22b7d9e430d3e4ba93fcf63431f0bc2daf69 Parents: 5e59bc5 Author: Deepak Majeti <[email protected]> Authored: Sat Feb 11 19:33:38 2017 -0500 Committer: Wes McKinney <[email protected]> Committed: Sat Feb 11 19:33:38 2017 -0500 ---------------------------------------------------------------------- src/parquet/column/statistics-test.cc | 46 +++++++ src/parquet/file/file-metadata-test.cc | 24 +++- src/parquet/file/metadata.cc | 190 ++++++++++++++++++++++------ src/parquet/file/metadata.h | 87 ++++++++----- src/parquet/file/reader-internal.cc | 4 +- 5 files changed, 271 insertions(+), 80 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/column/statistics-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/statistics-test.cc b/src/parquet/column/statistics-test.cc index dcaad45..d631d98 100644 --- a/src/parquet/column/statistics-test.cc +++ b/src/parquet/column/statistics-test.cc @@ -32,6 +32,7 @@ #include "parquet/file/reader.h" #include "parquet/file/writer.h" #include "parquet/schema.h" +#include "parquet/thrift.h" #include "parquet/types.h" #include "parquet/util/memory.h" @@ -167,6 +168,7 @@ class TestRowGroupStatistics : public PrimitiveTypedTest<TestType> { auto file_reader = ParquetFileReader::Open(source); auto rg_reader = file_reader->RowGroup(0); auto column_chunk = rg_reader->metadata()->ColumnChunk(0); + if (!column_chunk->is_stats_set()) return; std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics(); // check values after serialization + deserialization ASSERT_EQ(null_count, stats->null_count()); @@ -308,5 +310,49 @@ TYPED_TEST(TestNumericRowGroupStatistics, Merge) { this->TestMerge(); } +TEST(CorruptStatistics, Basics) { + ApplicationVersion version("parquet-mr version 1.8.0"); + SchemaDescriptor schema; + schema::NodePtr node; + std::vector<schema::NodePtr> fields; + // Test Physical Types + fields.push_back(schema::PrimitiveNode::Make( + "col1", Repetition::OPTIONAL, Type::INT32, LogicalType::NONE)); + fields.push_back(schema::PrimitiveNode::Make( + "col2", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::NONE)); + // Test Logical Types + fields.push_back(schema::PrimitiveNode::Make( + "col3", Repetition::OPTIONAL, Type::INT32, LogicalType::DATE)); + fields.push_back(schema::PrimitiveNode::Make( + "col4", Repetition::OPTIONAL, Type::INT32, LogicalType::UINT_32)); + fields.push_back(schema::PrimitiveNode::Make("col5", Repetition::OPTIONAL, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 12)); + fields.push_back(schema::PrimitiveNode::Make( + "col6", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8)); + node = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields); + schema.Init(node); + + format::ColumnChunk col_chunk; + col_chunk.meta_data.__isset.statistics = true; + auto column_chunk1 = ColumnChunkMetaData::Make( + reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(0), &version); + ASSERT_TRUE(column_chunk1->is_stats_set()); + auto column_chunk2 = ColumnChunkMetaData::Make( + reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(1), &version); + ASSERT_FALSE(column_chunk2->is_stats_set()); + auto column_chunk3 = ColumnChunkMetaData::Make( + reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(2), &version); + ASSERT_TRUE(column_chunk3->is_stats_set()); + auto column_chunk4 = ColumnChunkMetaData::Make( + reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(3), &version); + ASSERT_FALSE(column_chunk4->is_stats_set()); + auto column_chunk5 = ColumnChunkMetaData::Make( + reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(4), &version); + ASSERT_FALSE(column_chunk5->is_stats_set()); + auto column_chunk6 = ColumnChunkMetaData::Make( + reinterpret_cast<const uint8_t*>(&col_chunk), schema.Column(5), &version); + ASSERT_FALSE(column_chunk6->is_stats_set()); +} + } // namespace test } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/file/file-metadata-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/file-metadata-test.cc b/src/parquet/file/file-metadata-test.cc index 0edc34f..74d4406 100644 --- a/src/parquet/file/file-metadata-test.cc +++ b/src/parquet/file/file-metadata-test.cc @@ -181,13 +181,29 @@ TEST(Metadata, TestV1Version) { ASSERT_EQ(ParquetVersion::PARQUET_1_0, f_accessor->version()); } -TEST(FileVersion, Basics) { - FileMetaData::Version version("parquet-mr version 1.2.8"); +TEST(ApplicationVersion, Basics) { + ApplicationVersion version("parquet-mr version 1.7.9"); + ApplicationVersion version1("parquet-mr version 1.8.0"); + ApplicationVersion version2("parquet-cpp version 1.0.0"); + ApplicationVersion version3(""); ASSERT_EQ("parquet-mr", version.application); ASSERT_EQ(1, version.version.major); - ASSERT_EQ(2, version.version.minor); - ASSERT_EQ(8, version.version.patch); + ASSERT_EQ(7, version.version.minor); + ASSERT_EQ(9, version.version.patch); + + ASSERT_EQ("parquet-cpp", version2.application); + ASSERT_EQ(1, version2.version.major); + ASSERT_EQ(0, version2.version.minor); + ASSERT_EQ(0, version2.version.patch); + + ASSERT_EQ(true, version.VersionLt(version1)); + + ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96)); + ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32)); + ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY)); + ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY)); + ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY)); } } // namespace metadata http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/file/metadata.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc index de7c4e4..42942a5 100644 --- a/src/parquet/file/metadata.cc +++ b/src/parquet/file/metadata.cc @@ -30,6 +30,62 @@ namespace parquet { +const ApplicationVersion ApplicationVersion::PARQUET_251_FIXED_VERSION = + ApplicationVersion("parquet-mr version 1.8.0"); +const ApplicationVersion ApplicationVersion::PARQUET_816_FIXED_VERSION = + ApplicationVersion("parquet-mr version 1.2.9"); + +// Return the Sort Order of the Parquet Physical Types +SortOrder default_sort_order(Type::type primitive) { + switch (primitive) { + case Type::BOOLEAN: + case Type::INT32: + case Type::INT64: + case Type::FLOAT: + case Type::DOUBLE: + return SortOrder::SIGNED; + case Type::BYTE_ARRAY: + case Type::FIXED_LEN_BYTE_ARRAY: + case Type::INT96: // only used for timestamp, which uses unsigned values + return SortOrder::UNSIGNED; + } + return SortOrder::UNKNOWN; +} + +// Return the SortOrder of the Parquet Types using Logical or Physical Types +SortOrder get_sort_order(LogicalType::type converted, Type::type primitive) { + if (converted == LogicalType::NONE) return default_sort_order(primitive); + switch (converted) { + case LogicalType::INT_8: + case LogicalType::INT_16: + case LogicalType::INT_32: + case LogicalType::INT_64: + case LogicalType::DATE: + case LogicalType::TIME_MICROS: + case LogicalType::TIME_MILLIS: + case LogicalType::TIMESTAMP_MICROS: + case LogicalType::TIMESTAMP_MILLIS: + return SortOrder::SIGNED; + case LogicalType::UINT_8: + case LogicalType::UINT_16: + case LogicalType::UINT_32: + case LogicalType::UINT_64: + case LogicalType::ENUM: + case LogicalType::UTF8: + case LogicalType::BSON: + case LogicalType::JSON: + return SortOrder::UNSIGNED; + case LogicalType::DECIMAL: + case LogicalType::LIST: + case LogicalType::MAP: + case LogicalType::MAP_KEY_VALUE: + case LogicalType::INTERVAL: + case LogicalType::NONE: // required instead of default + return SortOrder::UNKNOWN; + } + return SortOrder::UNKNOWN; +} + template <typename DType> static std::shared_ptr<RowGroupStatistics> MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { @@ -66,14 +122,14 @@ std::shared_ptr<RowGroupStatistics> MakeColumnStats( // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: - explicit ColumnChunkMetaDataImpl( - const format::ColumnChunk* column, const ColumnDescriptor* descr) - : column_(column), descr_(descr) { + explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, + const ColumnDescriptor* descr, const ApplicationVersion* writer_version) + : column_(column), descr_(descr), writer_version_(writer_version) { const format::ColumnMetaData& meta_data = column->meta_data; for (auto encoding : meta_data.encodings) { encodings_.push_back(FromThrift(encoding)); } - if (meta_data.__isset.statistics) { stats_ = MakeColumnStats(meta_data, descr_); } + stats_ = nullptr; } ~ColumnChunkMetaDataImpl() {} @@ -82,7 +138,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline const std::string& file_path() const { return column_->file_path; } // column metadata - inline Type::type type() { return FromThrift(column_->meta_data.type); } + inline Type::type type() const { return FromThrift(column_->meta_data.type); } inline int64_t num_values() const { return column_->meta_data.num_values; } @@ -90,9 +146,26 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return std::make_shared<schema::ColumnPath>(column_->meta_data.path_in_schema); } - inline bool is_stats_set() const { return column_->meta_data.__isset.statistics; } + // Check if statistics are set and are valid + // 1) Must be set in the metadata + // 2) Statistics must not be corrupted + // 3) parquet-mr and parquet-cpp write statistics by SIGNED order comparison. + // The statistics are corrupted if the type requires UNSIGNED order comparison. + // Eg: UTF8 + inline bool is_stats_set() const { + DCHECK(writer_version_ != nullptr); + return column_->meta_data.__isset.statistics && + writer_version_->HasCorrectStatistics(type()) && + SortOrder::SIGNED == + get_sort_order(descr_->logical_type(), descr_->physical_type()); + } - inline std::shared_ptr<RowGroupStatistics> statistics() const { return stats_; } + inline std::shared_ptr<RowGroupStatistics> statistics() const { + if (stats_ == nullptr && is_stats_set()) { + stats_ = MakeColumnStats(column_->meta_data, descr_); + } + return stats_; + } inline Compression::type compression() const { return FromThrift(column_->meta_data.codec); @@ -123,21 +196,24 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } private: - std::shared_ptr<RowGroupStatistics> stats_; + mutable std::shared_ptr<RowGroupStatistics> stats_; std::vector<Encoding::type> encodings_; const format::ColumnChunk* column_; const ColumnDescriptor* descr_; + const ApplicationVersion* writer_version_; }; -std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make( - const uint8_t* metadata, const ColumnDescriptor* descr) { - return std::unique_ptr<ColumnChunkMetaData>(new ColumnChunkMetaData(metadata, descr)); +std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(const uint8_t* metadata, + const ColumnDescriptor* descr, const ApplicationVersion* writer_version) { + return std::unique_ptr<ColumnChunkMetaData>( + new ColumnChunkMetaData(metadata, descr, writer_version)); } -ColumnChunkMetaData::ColumnChunkMetaData( - const uint8_t* metadata, const ColumnDescriptor* descr) +ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata, + const ColumnDescriptor* descr, const ApplicationVersion* writer_version) : impl_{std::unique_ptr<ColumnChunkMetaDataImpl>(new ColumnChunkMetaDataImpl( - reinterpret_cast<const format::ColumnChunk*>(metadata), descr))} {} + reinterpret_cast<const format::ColumnChunk*>(metadata), descr, + writer_version))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -205,9 +281,9 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { public: - explicit RowGroupMetaDataImpl( - const format::RowGroup* row_group, const SchemaDescriptor* schema) - : row_group_(row_group), schema_(schema) {} + explicit RowGroupMetaDataImpl(const format::RowGroup* row_group, + const SchemaDescriptor* schema, const ApplicationVersion* writer_version) + : row_group_(row_group), schema_(schema), writer_version_(writer_version) {} ~RowGroupMetaDataImpl() {} inline int num_columns() const { return row_group_->columns.size(); } @@ -226,23 +302,27 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make( - reinterpret_cast<const uint8_t*>(&row_group_->columns[i]), schema_->Column(i)); + reinterpret_cast<const uint8_t*>(&row_group_->columns[i]), schema_->Column(i), + writer_version_); } private: const format::RowGroup* row_group_; const SchemaDescriptor* schema_; + const ApplicationVersion* writer_version_; }; -std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make( - const uint8_t* metadata, const SchemaDescriptor* schema) { - return std::unique_ptr<RowGroupMetaData>(new RowGroupMetaData(metadata, schema)); +std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(const uint8_t* metadata, + const SchemaDescriptor* schema, const ApplicationVersion* writer_version) { + return std::unique_ptr<RowGroupMetaData>( + new RowGroupMetaData(metadata, schema, writer_version)); } -RowGroupMetaData::RowGroupMetaData( - const uint8_t* metadata, const SchemaDescriptor* schema) +RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata, + const SchemaDescriptor* schema, const ApplicationVersion* writer_version) : impl_{std::unique_ptr<RowGroupMetaDataImpl>(new RowGroupMetaDataImpl( - reinterpret_cast<const format::RowGroup*>(metadata), schema))} {} + reinterpret_cast<const format::RowGroup*>(metadata), schema, writer_version))} { +} RowGroupMetaData::~RowGroupMetaData() {} int RowGroupMetaData::num_columns() const { @@ -277,9 +357,9 @@ class FileMetaData::FileMetaDataImpl { metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { - writer_version_ = FileMetaData::Version(metadata_->created_by); + writer_version_ = ApplicationVersion(metadata_->created_by); } else { - writer_version_ = FileMetaData::Version("unknown 0.0.0"); + writer_version_ = ApplicationVersion("unknown 0.0.0"); } InitSchema(); @@ -294,7 +374,7 @@ class FileMetaData::FileMetaDataImpl { inline const std::string& created_by() const { return metadata_->created_by; } inline int num_schema_elements() const { return metadata_->schema.size(); } - const FileMetaData::Version& writer_version() const { return writer_version_; } + const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(OutputStream* dst) { SerializeThriftMsg(metadata_.get(), 1024, dst); } @@ -306,7 +386,8 @@ class FileMetaData::FileMetaDataImpl { throw ParquetException(ss.str()); } return RowGroupMetaData::Make( - reinterpret_cast<const uint8_t*>(&metadata_->row_groups[i]), &schema_); + reinterpret_cast<const uint8_t*>(&metadata_->row_groups[i]), &schema_, + &writer_version_); } const SchemaDescriptor* schema() const { return &schema_; } @@ -321,7 +402,7 @@ class FileMetaData::FileMetaDataImpl { schema_.Init(converter.Convert()); } SchemaDescriptor schema_; - FileMetaData::Version writer_version_; + ApplicationVersion writer_version_; }; std::shared_ptr<FileMetaData> FileMetaData::Make( @@ -372,7 +453,7 @@ ParquetVersion::type FileMetaData::version() const { return ParquetVersion::PARQUET_1_0; } -const FileMetaData::Version& FileMetaData::writer_version() const { +const ApplicationVersion& FileMetaData::writer_version() const { return impl_->writer_version(); } @@ -392,7 +473,7 @@ void FileMetaData::WriteTo(OutputStream* dst) { return impl_->WriteTo(dst); } -FileMetaData::Version::Version(const std::string& created_by) { +ApplicationVersion::ApplicationVersion(const std::string& created_by) { namespace ba = boost::algorithm; std::string created_by_lower = created_by; @@ -423,18 +504,45 @@ FileMetaData::Version::Version(const std::string& created_by) { } } -bool FileMetaData::Version::VersionLt(int major, int minor, int patch) const { - if (version.major < major) return true; - if (version.major > major) return false; - DCHECK_EQ(version.major, major); - if (version.minor < minor) return true; - if (version.minor > minor) return false; - DCHECK_EQ(version.minor, minor); - return version.patch < patch; +bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) const { + if (application != other_version.application) return false; + + if (version.major < other_version.version.major) return true; + if (version.major > other_version.version.major) return false; + DCHECK_EQ(version.major, other_version.version.major); + if (version.minor < other_version.version.minor) return true; + if (version.minor > other_version.version.minor) return false; + DCHECK_EQ(version.minor, other_version.version.minor); + return version.patch < other_version.version.patch; +} + +bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) const { + return application == other_version.application && + version.major == other_version.version.major && + version.minor == other_version.version.minor && + version.patch == other_version.version.patch; } -bool FileMetaData::Version::VersionEq(int major, int minor, int patch) const { - return version.major == major && version.minor == minor && version.patch == patch; +// Reference: +// parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java +// PARQUET-686 has more disussion on statistics +bool ApplicationVersion::HasCorrectStatistics(Type::type col_type) const { + // None of the current tools write INT96 Statistics correctly + if (col_type == Type::INT96) return false; + + // Statistics of other types are OK + if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) { + return true; + } + + // created_by is not populated, which could have been caused by + // parquet-mr during the same time as PARQUET-251, see PARQUET-297 + if (application == "unknown") { return true; } + + // PARQUET-251 + if (VersionLt(PARQUET_251_FIXED_VERSION)) { return false; } + + return true; } // MetaData Builders http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/file/metadata.h ---------------------------------------------------------------------- diff --git a/src/parquet/file/metadata.h b/src/parquet/file/metadata.h index eab7fc6..97793a1 100644 --- a/src/parquet/file/metadata.h +++ b/src/parquet/file/metadata.h @@ -32,11 +32,56 @@ namespace parquet { +// Reference: +// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/ +// format/converter/ParquetMetadataConverter.java +// Sort order for page and column statistics. Types are associated with sort +// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are +// aggregated using a sort order. As of parquet-format version 2.3.1, the +// order used to aggregate stats is always SIGNED and is not stored in the +// Parquet file. These stats are discarded for types that need unsigned. +// See PARQUET-686. +enum SortOrder { SIGNED, UNSIGNED, UNKNOWN }; + +class ApplicationVersion { + public: + /// Known Versions with Issues + static const ApplicationVersion PARQUET_251_FIXED_VERSION; + static const ApplicationVersion PARQUET_816_FIXED_VERSION; + + /// Application that wrote the file. e.g. "IMPALA" + std::string application; + + /// Version of the application that wrote the file, expressed in three parts + /// (<major>.<minor>.<patch>). Unspecified parts default to 0, and extra parts are + /// ignored. e.g.: + /// "1.2.3" => {1, 2, 3} + /// "1.2" => {1, 2, 0} + /// "1.2-cdh5" => {1, 2, 0} + struct { + int major; + int minor; + int patch; + } version; + + ApplicationVersion() {} + explicit ApplicationVersion(const std::string& created_by); + + /// Returns true if version is strictly less than other_version + bool VersionLt(const ApplicationVersion& other_version) const; + + /// Returns true if version is strictly less than other_version + bool VersionEq(const ApplicationVersion& other_version) const; + + // Checks if the Version has the correct statistics for a given column + bool HasCorrectStatistics(Type::type primitive) const; +}; + class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor - static std::unique_ptr<ColumnChunkMetaData> Make( - const uint8_t* metadata, const ColumnDescriptor* descr); + static std::unique_ptr<ColumnChunkMetaData> Make(const uint8_t* metadata, + const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULL); ~ColumnChunkMetaData(); @@ -60,7 +105,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_uncompressed_size() const; private: - explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr); + explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr, + const ApplicationVersion* writer_version = NULL); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr<ColumnChunkMetaDataImpl> impl_; @@ -69,8 +115,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { class PARQUET_EXPORT RowGroupMetaData { public: // API convenience to get a MetaData accessor - static std::unique_ptr<RowGroupMetaData> Make( - const uint8_t* metadata, const SchemaDescriptor* schema); + static std::unique_ptr<RowGroupMetaData> Make(const uint8_t* metadata, + const SchemaDescriptor* schema, const ApplicationVersion* writer_version = NULL); ~RowGroupMetaData(); @@ -83,7 +129,8 @@ class PARQUET_EXPORT RowGroupMetaData { std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) const; private: - explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema); + explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema, + const ApplicationVersion* writer_version = NULL); // PIMPL Idiom class RowGroupMetaDataImpl; std::unique_ptr<RowGroupMetaDataImpl> impl_; @@ -93,32 +140,6 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: - struct Version { - /// Application that wrote the file. e.g. "IMPALA" - std::string application; - - /// Version of the application that wrote the file, expressed in three parts - /// (<major>.<minor>.<patch>). Unspecified parts default to 0, and extra parts are - /// ignored. e.g.: - /// "1.2.3" => {1, 2, 3} - /// "1.2" => {1, 2, 0} - /// "1.2-cdh5" => {1, 2, 0} - struct { - int major; - int minor; - int patch; - } version; - - Version() {} - explicit Version(const std::string& created_by); - - /// Returns true if version is strictly less than <major>.<minor>.<patch> - bool VersionLt(int major, int minor = 0, int patch = 0) const; - - /// Returns true if version is equal to <major>.<minor>.<patch> - bool VersionEq(int major, int minor, int patch) const; - }; - // API convenience to get a MetaData accessor static std::shared_ptr<FileMetaData> Make( const uint8_t* serialized_metadata, uint32_t* metadata_len); @@ -135,7 +156,7 @@ class PARQUET_EXPORT FileMetaData { int num_schema_elements() const; std::unique_ptr<RowGroupMetaData> RowGroup(int i) const; - const Version& writer_version() const; + const ApplicationVersion& writer_version() const; void WriteTo(OutputStream* dst); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1a6d22b7/src/parquet/file/reader-internal.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc index 4eb40b4..e981e36 100644 --- a/src/parquet/file/reader-internal.cc +++ b/src/parquet/file/reader-internal.cc @@ -183,8 +183,8 @@ std::unique_ptr<PageReader> SerializedRowGroup::GetColumnPageReader(int i) { std::unique_ptr<InputStream> stream; // PARQUET-816 workaround for old files created by older parquet-mr - const FileMetaData::Version& version = file_metadata_->writer_version(); - if (version.application == "parquet-mr" && version.VersionLt(1, 2, 9)) { + const ApplicationVersion& version = file_metadata_->writer_version(); + if (version.VersionLt(ApplicationVersion::PARQUET_816_FIXED_VERSION)) { // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the // dictionary page header size in total_compressed_size and total_uncompressed_size // (see IMPALA-694). We add padding to compensate.
