Repository: parquet-cpp Updated Branches: refs/heads/master e93a085d7 -> b36c9ac00
PARQUET-978: [C++] Minimizing footer reads for small(ish) metadata I realized we don't have a full end to end `writer-reader-test` for parquet. Would have been easier to test this PR. Does it make sense adding one? Author: Deepak Majeti <[email protected]> Closes #341 from majetideepak/PARQUET-978 and squashes the following commits: 104df0c [Deepak Majeti] clang format dbb28be [Deepak Majeti] check metadata size in reader-test 2b13715 [Deepak Majeti] Fix comment in writer-internal.cc 850b61c [Deepak Majeti] PARQUET-978 Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/b36c9ac0 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/b36c9ac0 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/b36c9ac0 Branch: refs/heads/master Commit: b36c9ac000ae64a27c51aecb48da9b270c83b2e8 Parents: e93a085 Author: Deepak Majeti <[email protected]> Authored: Sat May 27 16:32:10 2017 +0200 Committer: Uwe L. Korn <[email protected]> Committed: Sat May 27 16:32:10 2017 +0200 ---------------------------------------------------------------------- src/parquet/file/reader-internal.cc | 29 +++++++++++++++++++++-------- src/parquet/file/writer-internal.cc | 1 - src/parquet/reader-test.cc | 2 ++ 3 files changed, 23 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b36c9ac0/src/parquet/file/reader-internal.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc index c05bb12..3543759 100644 --- a/src/parquet/file/reader-internal.cc +++ b/src/parquet/file/reader-internal.cc @@ -202,6 +202,8 @@ std::unique_ptr<PageReader> SerializedRowGroup::GetColumnPageReader(int i) { // ---------------------------------------------------------------------- // SerializedFile: Parquet on-disk layout +// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file +static constexpr int64_t DEFAULT_FOOTER_READ_SIZE = 64 * 1024; static constexpr uint32_t FOOTER_SIZE = 8; static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; @@ -255,15 +257,19 @@ void SerializedFile::ParseMetaData() { throw ParquetException("Corrupted file, smaller than file footer"); } - uint8_t footer_buffer[FOOTER_SIZE]; + uint8_t footer_buffer[DEFAULT_FOOTER_READ_SIZE]; + int64_t footer_read_size = std::min(file_size, DEFAULT_FOOTER_READ_SIZE); int64_t bytes_read = - source_->ReadAt(file_size - FOOTER_SIZE, FOOTER_SIZE, footer_buffer); + source_->ReadAt(file_size - footer_read_size, footer_read_size, footer_buffer); - if (bytes_read != FOOTER_SIZE || memcmp(footer_buffer + 4, PARQUET_MAGIC, 4) != 0) { + // Check if all bytes are read. Check if last 4 bytes read have the magic bits + if (bytes_read != footer_read_size || + memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) != 0) { throw ParquetException("Invalid parquet file. Corrupt footer."); } - uint32_t metadata_len = *reinterpret_cast<uint32_t*>(footer_buffer); + uint32_t metadata_len = + *reinterpret_cast<uint32_t*>(footer_buffer + footer_read_size - FOOTER_SIZE); int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len; if (FOOTER_SIZE + metadata_len > file_size) { throw ParquetException( @@ -273,10 +279,17 @@ void SerializedFile::ParseMetaData() { std::shared_ptr<PoolBuffer> metadata_buffer = AllocateBuffer(properties_.memory_pool(), metadata_len); - bytes_read = - source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data()); - if (bytes_read != metadata_len) { - throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (metadata_len + FOOTER_SIZE)) { + memcpy(metadata_buffer->mutable_data(), + footer_buffer + (footer_read_size - metadata_len - FOOTER_SIZE), metadata_len); + } else { + bytes_read = + source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data()); + if (bytes_read != metadata_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } } file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b36c9ac0/src/parquet/file/writer-internal.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/writer-internal.cc b/src/parquet/file/writer-internal.cc index b69e87e..633498c 100644 --- a/src/parquet/file/writer-internal.cc +++ b/src/parquet/file/writer-internal.cc @@ -62,7 +62,6 @@ static format::Statistics ToThrift(const EncodedStatistics& row_group_statistics void SerializedPageWriter::Close(bool has_dictionary, bool fallback) { // index_page_offset = 0 since they are not supported - // TODO: Remove default fallback = 'false' when implemented metadata_->Finish(num_values_, dictionary_page_offset_, 0, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b36c9ac0/src/parquet/reader-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc index 71f982b..e8a6813 100644 --- a/src/parquet/reader-test.cc +++ b/src/parquet/reader-test.cc @@ -83,6 +83,8 @@ TEST_F(TestAllTypesPlain, TestBatchRead) { ASSERT_EQ(8, reader_->metadata()->num_rows()); // This file only has 1 row group ASSERT_EQ(1, reader_->metadata()->num_row_groups()); + // Size of the metadata is 730 bytes + ASSERT_EQ(730, reader_->metadata()->size()); // This row group must have 8 rows ASSERT_EQ(8, group->metadata()->num_rows());
