This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 4712daba56 GH-38326: [C++][Parquet] check the decompressed page size
same as size in page header (#38327)
4712daba56 is described below
commit 4712daba56a282137bc72e05f10a3db152ee2534
Author: mwish <[email protected]>
AuthorDate: Fri Oct 20 00:29:52 2023 +0800
GH-38326: [C++][Parquet] check the decompressed page size same as size in
page header (#38327)
### Rationale for this change
As mentioned in issue, currently we only decompress the page without
checking the decompress size.
This patch add a checkings.
### What changes are included in this PR?
Throw exception when size not matches in
`SerializedPageReader::DecompressIfNeeded`
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Non-conforming files may throw an exception while they would silently
return invalid results before.
* Closes: #38326
Authored-by: mwish <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/column_reader.cc | 15 +++++++++---
cpp/src/parquet/column_reader_test.cc | 4 +--
cpp/src/parquet/file_deserialize_test.cc | 42 ++++++++++++++++++++++++++++++++
3 files changed, 55 insertions(+), 6 deletions(-)
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index e09369effb..ecc48811e4 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -594,10 +594,17 @@ std::shared_ptr<Buffer>
SerializedPageReader::DecompressIfNeeded(
}
// Decompress the values
- PARQUET_THROW_NOT_OK(decompressor_->Decompress(
- compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len,
- uncompressed_len - levels_byte_len,
- decompression_buffer_->mutable_data() + levels_byte_len));
+ PARQUET_ASSIGN_OR_THROW(
+ auto decompressed_len,
+ decompressor_->Decompress(compressed_len - levels_byte_len,
+ page_buffer->data() + levels_byte_len,
+ uncompressed_len - levels_byte_len,
+ decompression_buffer_->mutable_data() +
levels_byte_len));
+ if (decompressed_len != uncompressed_len - levels_byte_len) {
+ throw ParquetException("Page didn't decompress to expected size, expected:
" +
+ std::to_string(uncompressed_len - levels_byte_len) +
+ ", but got:" + std::to_string(decompressed_len));
+ }
return decompression_buffer_;
}
diff --git a/cpp/src/parquet/column_reader_test.cc
b/cpp/src/parquet/column_reader_test.cc
index b3b75a7bb5..bed7e06786 100644
--- a/cpp/src/parquet/column_reader_test.cc
+++ b/cpp/src/parquet/column_reader_test.cc
@@ -1587,8 +1587,8 @@ class ByteArrayRecordReaderTest : public
::testing::TestWithParam<bool> {
};
// Tests reading and skipping a ByteArray field.
-// The binary readers only differ in DeocdeDense and DecodeSpaced functions, so
-// testing optional is sufficient in excercising those code paths.
+// The binary readers only differ in DecodeDense and DecodeSpaced functions, so
+// testing optional is sufficient in exercising those code paths.
TEST_P(ByteArrayRecordReaderTest, ReadAndSkipOptional) {
MakeRecordReader(/*levels_per_page=*/90, /*num_pages=*/1);
diff --git a/cpp/src/parquet/file_deserialize_test.cc
b/cpp/src/parquet/file_deserialize_test.cc
index b2a918915e..4377e714a2 100644
--- a/cpp/src/parquet/file_deserialize_test.cc
+++ b/cpp/src/parquet/file_deserialize_test.cc
@@ -879,6 +879,48 @@ TEST_F(TestPageSerde, DataPageV2CrcCheckNonExistent) {
/* write_data_page_v2 */ true);
}
+TEST_F(TestPageSerde, BadCompressedPageSize) {
+ // GH-38326: an exception should be raised if a compressed data page
+ // decompresses to a smaller size than declared in the data page header.
+ auto codec_types = GetSupportedCodecTypes();
+ const int data_page_bytes = 8192;
+ const int32_t num_rows = 32; // dummy value
+ data_page_header_.num_values = num_rows;
+ std::vector<uint8_t> faux_data;
+ // A well-compressible piece of data
+ faux_data.resize(data_page_bytes, 1);
+ for (auto codec_type : codec_types) {
+ auto codec = GetCodec(codec_type);
+ std::vector<uint8_t> buffer;
+ const uint8_t* data = faux_data.data();
+ int data_size = static_cast<int>(faux_data.size());
+
+ int64_t max_compressed_size = codec->MaxCompressedLen(data_size, data);
+ buffer.resize(max_compressed_size);
+
+ int64_t actual_size;
+ ASSERT_OK_AND_ASSIGN(
+ actual_size, codec->Compress(data_size, data, max_compressed_size,
&buffer[0]));
+ // Write a data page header declaring a larger decompressed size than
actual
+ ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader(data_page_bytes, data_size + 1,
+
static_cast<int32_t>(actual_size)));
+ ASSERT_OK(out_stream_->Write(buffer.data(), actual_size));
+ InitSerializedPageReader(num_rows, codec_type);
+
+ EXPECT_THROW_THAT(
+ [&]() { page_reader_->NextPage(); }, ParquetException,
+ ::testing::AnyOf(
+ ::testing::Property(
+ &ParquetException::what,
+ ::testing::HasSubstr("Page didn't decompress to expected
size")),
+ // Some decompressor, like zstd, might be able to detect the error
+ // before checking the page size.
+ ::testing::Property(&ParquetException::what,
+ ::testing::HasSubstr("IOError"))));
+ ResetStream();
+ }
+}
+
// ----------------------------------------------------------------------
// File structure tests