Repository: parquet-cpp Updated Branches: refs/heads/master fbdcb21d4 -> 35a48fb26
PARQUET-525: Add test coverage for failure modes in ParseMetaData Author: Wes McKinney <[email protected]> Closes #60 from wesm/PARQUET-525 and squashes the following commits: 04eea0f [Wes McKinney] Test various invalid files checked in reader-internal.cc. Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/35a48fb2 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/35a48fb2 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/35a48fb2 Branch: refs/heads/master Commit: 35a48fb26d76a959857d6a03538fd6c8b7b5733a Parents: fbdcb21 Author: Wes McKinney <[email protected]> Authored: Mon Feb 22 14:08:28 2016 -0800 Committer: Julien Le Dem <[email protected]> Committed: Mon Feb 22 14:08:28 2016 -0800 ---------------------------------------------------------------------- src/parquet/file/file-deserialize-test.cc | 55 ++++++++++++++++++++++++++ src/parquet/file/reader-internal.cc | 9 +---- src/parquet/util/input.cc | 54 ++++++++++++++++++++++--- src/parquet/util/input.h | 37 +++++++++++++++-- 4 files changed, 138 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/35a48fb2/src/parquet/file/file-deserialize-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/file-deserialize-test.cc b/src/parquet/file/file-deserialize-test.cc index 40d599f..3ce6084 100644 --- a/src/parquet/file/file-deserialize-test.cc +++ b/src/parquet/file/file-deserialize-test.cc @@ -236,4 +236,59 @@ TEST_F(TestPageSerde, LZONotSupported) { ASSERT_THROW(InitSerializedPageReader(Compression::LZO), ParquetException); } +// ---------------------------------------------------------------------- +// File structure tests + +class TestParquetFileReader : public ::testing::Test { + public: + void AssertInvalidFileThrows(const std::shared_ptr<Buffer>& buffer) { + std::unique_ptr<BufferReader> reader(new BufferReader(buffer)); + reader_.reset(new ParquetFileReader()); + + ASSERT_THROW(reader_->Open(SerializedFile::Open(std::move(reader))), + ParquetException); + } + + protected: + std::unique_ptr<ParquetFileReader> reader_; +}; + +TEST_F(TestParquetFileReader, InvalidHeader) { + const char* bad_header = "PAR2"; + + auto buffer = std::make_shared<Buffer>( + reinterpret_cast<const uint8_t*>(bad_header), strlen(bad_header)); + AssertInvalidFileThrows(buffer); +} + +TEST_F(TestParquetFileReader, InvalidFooter) { + // File is smaller than FOOTER_SIZE + const char* bad_file = "PAR1PAR"; + auto buffer = std::make_shared<Buffer>( + reinterpret_cast<const uint8_t*>(bad_file), strlen(bad_file)); + AssertInvalidFileThrows(buffer); + + // Magic number incorrect + const char* bad_file2 = "PAR1PAR2"; + buffer = std::make_shared<Buffer>( + reinterpret_cast<const uint8_t*>(bad_file2), strlen(bad_file2)); + AssertInvalidFileThrows(buffer); +} + +TEST_F(TestParquetFileReader, IncompleteMetadata) { + InMemoryOutputStream stream; + + const char* magic = "PAR1"; + + stream.Write(reinterpret_cast<const uint8_t*>(magic), strlen(magic)); + std::vector<uint8_t> bytes(10); + stream.Write(bytes.data(), bytes.size()); + uint32_t metadata_len = 24; + stream.Write(reinterpret_cast<const uint8_t*>(&metadata_len), sizeof(uint32_t)); + stream.Write(reinterpret_cast<const uint8_t*>(magic), strlen(magic)); + + auto buffer = stream.GetBuffer(); + AssertInvalidFileThrows(buffer); +} + } // namespace parquet_cpp http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/35a48fb2/src/parquet/file/reader-internal.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc index 14fcbac..24a8a8a 100644 --- a/src/parquet/file/reader-internal.cc +++ b/src/parquet/file/reader-internal.cc @@ -248,11 +248,8 @@ void SerializedFile::ParseMetaData() { uint8_t footer_buffer[FOOTER_SIZE]; source_->Seek(filesize - FOOTER_SIZE); size_t bytes_read = source_->Read(FOOTER_SIZE, footer_buffer); - - if (bytes_read != FOOTER_SIZE) { - throw ParquetException("Invalid parquet file. Corrupt footer."); - } - if (memcmp(footer_buffer + 4, PARQUET_MAGIC, 4) != 0) { + if (bytes_read != FOOTER_SIZE || + memcmp(footer_buffer + 4, PARQUET_MAGIC, 4) != 0) { throw ParquetException("Invalid parquet file. Corrupt footer."); } @@ -262,7 +259,6 @@ void SerializedFile::ParseMetaData() { throw ParquetException("Invalid parquet file. File is less than " "file metadata size."); } - source_->Seek(metadata_start); std::vector<uint8_t> metadata_buffer(metadata_len); @@ -270,7 +266,6 @@ void SerializedFile::ParseMetaData() { if (bytes_read != metadata_len) { throw ParquetException("Invalid parquet file. Could not read metadata bytes."); } - DeserializeThriftMsg(&metadata_buffer[0], &metadata_len, &metadata_); schema::FlatSchemaConverter converter(&metadata_.schema[0], http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/35a48fb2/src/parquet/util/input.cc ---------------------------------------------------------------------- diff --git a/src/parquet/util/input.cc b/src/parquet/util/input.cc index 1df9060..a238ff6 100644 --- a/src/parquet/util/input.cc +++ b/src/parquet/util/input.cc @@ -44,6 +44,9 @@ void LocalFileSource::Open(const std::string& path) { path_ = path; file_ = fopen(path_.c_str(), "r"); is_open_ = true; + fseek(file_, 0L, SEEK_END); + size_ = Tell(); + Seek(0); } void LocalFileSource::Close() { @@ -58,16 +61,15 @@ void LocalFileSource::CloseFile() { } } -int64_t LocalFileSource::Size() { - fseek(file_, 0L, SEEK_END); - return Tell(); -} - void LocalFileSource::Seek(int64_t pos) { fseek(file_, pos, SEEK_SET); } -int64_t LocalFileSource::Tell() { +int64_t LocalFileSource::Size() const { + return size_; +} + +int64_t LocalFileSource::Tell() const { return ftell(file_); } @@ -87,6 +89,46 @@ std::shared_ptr<Buffer> LocalFileSource::Read(int64_t nbytes) { } // ---------------------------------------------------------------------- +// BufferReader + +BufferReader::BufferReader(const std::shared_ptr<Buffer>& buffer) : + buffer_(buffer), + data_(buffer->data()), + pos_(0) { + size_ = buffer->size(); +} + +int64_t BufferReader::Tell() const { + return pos_; +} + +void BufferReader::Seek(int64_t pos) { + if (pos < 0 || pos >= size_) { + std::stringstream ss; + ss << "Cannot seek to " << pos + << "File is length " << size_; + throw ParquetException(ss.str()); + } + pos_ = pos; +} + +int64_t BufferReader::Size() const { + return size_; +} + +int64_t BufferReader::Read(int64_t nbytes, uint8_t* out) { + ParquetException::NYI("not implemented"); + return 0; +} + +std::shared_ptr<Buffer> BufferReader::Read(int64_t nbytes) { + int64_t bytes_available = std::min(nbytes, size_ - pos_); + auto result = std::make_shared<Buffer>(Head(), bytes_available); + pos_ += bytes_available; + return result; +} + +// ---------------------------------------------------------------------- // InMemoryInputStream InMemoryInputStream::InMemoryInputStream(const std::shared_ptr<Buffer>& buffer) : http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/35a48fb2/src/parquet/util/input.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/input.h b/src/parquet/util/input.h index ac1b4b5..5f2bde3 100644 --- a/src/parquet/util/input.h +++ b/src/parquet/util/input.h @@ -36,9 +36,10 @@ class RandomAccessSource { public: virtual ~RandomAccessSource() {} + virtual int64_t Size() const = 0; + virtual void Close() = 0; - virtual int64_t Size() = 0; - virtual int64_t Tell() = 0; + virtual int64_t Tell() const = 0; virtual void Seek(int64_t pos) = 0; // Returns actual number of bytes read @@ -46,6 +47,9 @@ class RandomAccessSource { virtual std::shared_ptr<Buffer> Read(int64_t nbytes) = 0; std::shared_ptr<Buffer> ReadAt(int64_t pos, int64_t nbytes); + + protected: + int64_t size_; }; @@ -57,8 +61,8 @@ class LocalFileSource : public RandomAccessSource { void Open(const std::string& path); virtual void Close(); - virtual int64_t Size(); - virtual int64_t Tell(); + virtual int64_t Size() const; + virtual int64_t Tell() const; virtual void Seek(int64_t pos); // Returns actual number of bytes read @@ -78,6 +82,31 @@ class LocalFileSource : public RandomAccessSource { }; // ---------------------------------------------------------------------- +// A file-like object that reads from virtual address space + +class BufferReader : public RandomAccessSource { + public: + explicit BufferReader(const std::shared_ptr<Buffer>& buffer); + virtual void Close() {} + virtual int64_t Tell() const; + virtual void Seek(int64_t pos); + virtual int64_t Size() const; + + virtual int64_t Read(int64_t nbytes, uint8_t* out); + + virtual std::shared_ptr<Buffer> Read(int64_t nbytes); + + protected: + const uint8_t* Head() { + return data_ + pos_; + } + + std::shared_ptr<Buffer> buffer_; + const uint8_t* data_; + int64_t pos_; +}; + +// ---------------------------------------------------------------------- // Streaming input interfaces // Interface for the column reader to get the bytes. The interface is a stream
