This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new ea6dc7beb8 GH-48308: [C++][Parquet] Fix potential crash when reading invalid Parquet data (#48309)
ea6dc7beb8 is described below
commit ea6dc7beb8b68d4b1d607694f12fdfb8013442d1
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Dec 3 11:32:24 2025 +0100
GH-48308: [C++][Parquet] Fix potential crash when reading invalid Parquet data (#48309)
### Rationale for this change
1. Make value length validation stricter in PLAIN decoding of BYTE_ARRAY
columns (see the first sketch below).
2. Ensure BinaryBuilder always reserves a non-zero data length, to avoid
encountering a null data pointer when unsafe-appending values (see the
second sketch below).
This should fix https://oss-fuzz.com/testcase-detail/4570087119192064
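A minimal sketch of the stricter validation (point 1), written as a hypothetical standalone function rather than the decoder itself: with `n` non-null PLAIN values in a buffer of `len` bytes, each value carries a 4-byte length header, so at most `len - 4 * n` bytes of payload can remain. A negative budget, a negative `value_len`, or a `value_len` above the running budget all indicate invalid data.

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical illustration of the bound checks; names and error handling are
// not the decoder's own.
std::vector<std::string> DecodePlainByteArray(const uint8_t* data, int64_t len,
                                              int64_t num_non_null_values) {
  // Each value has a 4-byte length header that is not payload, so the payload
  // budget starts at `len - 4 * num_non_null_values`.
  int64_t remaining_payload = len - 4 * num_non_null_values;
  if (remaining_payload < 0) {
    throw std::runtime_error("Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
  }
  std::vector<std::string> values;
  values.reserve(static_cast<size_t>(num_non_null_values));
  for (int64_t i = 0; i < num_non_null_values; ++i) {
    int32_t value_len;
    std::memcpy(&value_len, data, sizeof(value_len));  // assumes a little-endian host
    // Rejecting `value_len > remaining_payload` also guarantees that
    // `4 + value_len` bytes are still available in the buffer, because every
    // remaining value still needs its own 4-byte header.
    if (value_len < 0 || value_len > remaining_payload) {
      throw std::runtime_error("Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
    }
    values.emplace_back(reinterpret_cast<const char*>(data + 4), value_len);
    data += 4 + value_len;
    len -= 4 + value_len;
    remaining_payload -= value_len;
  }
  return values;
}
```

A second sketch for point 2, assuming the public `arrow::BinaryBuilder` API (`Reserve`, `ReserveData`): if the estimated data length is zero, `ReserveData(0)` may leave the value-data buffer unallocated, so a later unsafe append would touch a null data pointer; clamping the reservation to at least one byte avoids that.

```cpp
#include <algorithm>

#include <arrow/builder.h>
#include <arrow/status.h>

// Hypothetical helper, not the code in this commit: always reserve a strictly
// positive amount of value data before doing unsafe appends.
arrow::Status ReserveForUnsafeAppends(arrow::BinaryBuilder* builder,
                                      int64_t num_values,
                                      int64_t estimated_data_length) {
  ARROW_RETURN_NOT_OK(builder->Reserve(num_values));
  return builder->ReserveData(std::max<int64_t>(1, estimated_data_length));
}
```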
### Are these changes tested?
Yes, by additional fuzz regression file.
### Are there any user-facing changes?
No.
**This PR contains a "Critical Fix".** It fixes a bug that can cause a crash
when reading invalid Parquet data, even when the API contract is upheld.
* GitHub Issue: #48308
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/decoder.cc | 89 +++++++++++++++++++++++++---------------------
 testing                    |  2 +-
 2 files changed, 50 insertions(+), 41 deletions(-)
diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index 431f2d2604..3ce2323d29 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -92,11 +92,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
// If estimated_data_length is provided, it will also reserve the estimated data length.
Status Prepare(int64_t length, std::optional<int64_t> estimated_data_length = {}) {
entries_remaining_ = length;
- RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
- if (estimated_data_length.has_value()) {
- RETURN_NOT_OK(builder_->ReserveData(
- std::min<int64_t>(*estimated_data_length, this->chunk_space_remaining_)));
- }
+ RETURN_NOT_OK(ReserveInitialChunkData(estimated_data_length));
return Status::OK();
}
@@ -110,11 +106,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
// This element would exceed the capacity of a chunk
RETURN_NOT_OK(PushChunk());
// Reserve entries and data in new chunk
- RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
- if (estimated_remaining_data_length.has_value()) {
- RETURN_NOT_OK(builder_->ReserveData(
- std::min<int64_t>(*estimated_remaining_data_length, chunk_space_remaining_)));
- }
+ RETURN_NOT_OK(ReserveInitialChunkData(estimated_remaining_data_length));
}
chunk_space_remaining_ -= length;
--entries_remaining_;
@@ -147,6 +139,16 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
return Status::OK();
}
+ Status ReserveInitialChunkData(std::optional<int64_t> estimated_remaining_data_length) {
+ RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
+ if (estimated_remaining_data_length.has_value()) {
+ int64_t required_capacity =
+ std::min(*estimated_remaining_data_length, chunk_space_remaining_);
+ RETURN_NOT_OK(builder_->ReserveData(required_capacity));
+ }
+ return Status::OK();
+ }
+
bool CanFit(int64_t length) const { return length <= chunk_space_remaining_; }
Accumulator* acc_;
@@ -754,43 +756,50 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
int64_t valid_bits_offset,
typename EncodingTraits<ByteArrayType>::Accumulator* out,
int* out_values_decoded) {
- // We're going to decode up to `num_values - null_count` PLAIN values,
+ // We're going to decode `num_values - null_count` PLAIN values,
// and each value has a 4-byte length header that doesn't count for the
// Arrow binary data length.
- int64_t estimated_data_length =
- std::max<int64_t>(0, len_ - 4 * (num_values - null_count));
+ int64_t estimated_data_length = len_ - 4 * (num_values - null_count);
+ if (ARROW_PREDICT_FALSE(estimated_data_length < 0)) {
+ return Status::Invalid("Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
+ }
auto visit_binary_helper = [&](auto* helper) {
int values_decoded = 0;
- RETURN_NOT_OK(VisitBitRuns(
- valid_bits, valid_bits_offset, num_values,
- [&](int64_t position, int64_t run_length, bool is_valid) {
- if (is_valid) {
- for (int64_t i = 0; i < run_length; ++i) {
- if (ARROW_PREDICT_FALSE(len_ < 4)) {
- return Status::Invalid(
- "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
- }
- auto value_len = SafeLoadAs<int32_t>(data_);
- if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) {
- return Status::Invalid(
- "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
- }
- RETURN_NOT_OK(
- helper->AppendValue(data_ + 4, value_len, estimated_data_length));
- auto increment = value_len + 4;
- data_ += increment;
- len_ -= increment;
- estimated_data_length -= value_len;
- DCHECK_GE(estimated_data_length, 0);
- }
- values_decoded += static_cast<int>(run_length);
- return Status::OK();
- } else {
- return helper->AppendNulls(run_length);
+ auto visit_run = [&](int64_t position, int64_t run_length, bool is_valid) {
+ if (is_valid) {
+ for (int64_t i = 0; i < run_length; ++i) {
+ // We ensure `len_` is sufficient thanks to:
+ // 1. the initial `estimated_data_length` check above,
+ // 2. the running `value_len > estimated_data_length` check below.
+ // This precondition follows from those two checks.
+ DCHECK_GE(len_, 4);
+ auto value_len = SafeLoadAs<int32_t>(data_);
+ // This check also ensures that `value_len <= len_ - 4` due to the way
+ // `estimated_data_length` is computed.
+ if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > estimated_data_length)) {
+ return Status::Invalid(
+ "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
}
- }));
+ RETURN_NOT_OK(
+ helper->AppendValue(data_ + 4, value_len, estimated_data_length));
+ auto increment = value_len + 4;
+ data_ += increment;
+ len_ -= increment;
+ estimated_data_length -= value_len;
+ DCHECK_GE(estimated_data_length, 0);
+ }
+ values_decoded += static_cast<int>(run_length);
+ DCHECK_LE(values_decoded, num_values);
+ return Status::OK();
+ } else {
+ return helper->AppendNulls(run_length);
+ }
+ };
+
+ RETURN_NOT_OK(
+ VisitBitRuns(valid_bits, valid_bits_offset, num_values, std::move(visit_run)));
num_values_ -= values_decoded;
*out_values_decoded = values_decoded;
diff --git a/testing b/testing
index 047d391459..da559b5412 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 047d3914590d8379b5da19c4f5b0d1869a8ecdb3
+Subproject commit da559b5412551c490a97c67f6779fcefd4352aff