This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new ea6dc7beb8 GH-48308: [C++][Parquet] Fix potential crash when reading invalid Parquet data (#48309)
ea6dc7beb8 is described below
commit ea6dc7beb8b68d4b1d607694f12fdfb8013442d1
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Dec 3 11:32:24 2025 +0100
GH-48308: [C++][Parquet] Fix potential crash when reading invalid Parquet data (#48309)
### Rationale for this change
1. Make value length validation stricter in PLAIN decoding of BYTE_ARRAY
columns (see the first sketch below).
2. Ensure BinaryBuilder always reserves a non-zero data length, to avoid
encountering a null data pointer when unsafe-appending values (see the
second sketch below).
This should fix https://oss-fuzz.com/testcase-detail/4570087119192064
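A minimal sketch of the stricter validation (point 1), written as a hypothetical standalone function rather than the decoder itself: with `n` non-null PLAIN values in a buffer of `len` bytes, each value carries a 4-byte length header, so at most `len - 4 * n` bytes of payload can remain. A negative budget, a negative `value_len`, or a `value_len` above the running budget all indicate invalid data.

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical illustration of the bound checks; names and error handling are
// not the decoder's own.
std::vector<std::string> DecodePlainByteArray(const uint8_t* data, int64_t len,
                                              int64_t num_non_null_values) {
  // Each value has a 4-byte length header that is not payload, so the payload
  // budget starts at `len - 4 * num_non_null_values`.
  int64_t remaining_payload = len - 4 * num_non_null_values;
  if (remaining_payload < 0) {
    throw std::runtime_error("Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
  }
  std::vector<std::string> values;
  values.reserve(static_cast<size_t>(num_non_null_values));
  for (int64_t i = 0; i < num_non_null_values; ++i) {
    int32_t value_len;
    std::memcpy(&value_len, data, sizeof(value_len));  // assumes a little-endian host
    // Rejecting `value_len > remaining_payload` also guarantees that
    // `4 + value_len` bytes are still available in the buffer, because every
    // remaining value still needs its own 4-byte header.
    if (value_len < 0 || value_len > remaining_payload) {
      throw std::runtime_error("Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
    }
    values.emplace_back(reinterpret_cast<const char*>(data + 4), value_len);
    data += 4 + value_len;
    len -= 4 + value_len;
    remaining_payload -= value_len;
  }
  return values;
}
```

A second sketch for point 2, assuming the public `arrow::BinaryBuilder` API (`Reserve`, `ReserveData`): if the estimated data length is zero, `ReserveData(0)` may leave the value-data buffer unallocated, so a later unsafe append would touch a null data pointer; clamping the reservation to at least one byte avoids that.

```cpp
#include <algorithm>

#include <arrow/builder.h>
#include <arrow/status.h>

// Hypothetical helper, not the code in this commit: always reserve a strictly
// positive amount of value data before doing unsafe appends.
arrow::Status ReserveForUnsafeAppends(arrow::BinaryBuilder* builder,
                                      int64_t num_values,
                                      int64_t estimated_data_length) {
  ARROW_RETURN_NOT_OK(builder->Reserve(num_values));
  return builder->ReserveData(std::max<int64_t>(1, estimated_data_length));
}
```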
### Are these changes tested?
Yes, by additional fuzz regression file.
### Are there any user-facing changes?
No.
**This PR contains a "Critical Fix".** It fixes a bug that can cause a crash
when reading invalid Parquet data, even when the API contract is upheld.
* GitHub Issue: #48308
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/decoder.cc | 89 +++++++++++++++++++++++++---------------------
 testing                    |  2 +-
 2 files changed, 50 insertions(+), 41 deletions(-)
diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index 431f2d2604..3ce2323d29 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -92,11 +92,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
// If estimated_data_length is provided, it will also reserve the estimated data length.
Status Prepare(int64_t length, std::optional<int64_t> estimated_data_length = {}) {
entries_remaining_ = length;
- RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
- if (estimated_data_length.has_value()) {
- RETURN_NOT_OK(builder_->ReserveData(
- std::min<int64_t>(*estimated_data_length, this->chunk_space_remaining_)));
- }
+ RETURN_NOT_OK(ReserveInitialChunkData(estimated_data_length));
return Status::OK();
}
@@ -110,11 +106,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
// This element would exceed the capacity of a chunk
RETURN_NOT_OK(PushChunk());
// Reserve entries and data in new chunk
- RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
- if (estimated_remaining_data_length.has_value()) {
- RETURN_NOT_OK(builder_->ReserveData(
- std::min<int64_t>(*estimated_remaining_data_length, chunk_space_remaining_)));
- }
+ RETURN_NOT_OK(ReserveInitialChunkData(estimated_remaining_data_length));
}
chunk_space_remaining_ -= length;
--entries_remaining_;
@@ -147,6 +139,16 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
return Status::OK();
}
+ Status ReserveInitialChunkData(std::optional<int64_t> estimated_remaining_data_length) {
+ RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
+ if (estimated_remaining_data_length.has_value()) {
+ int64_t required_capacity =
+ std::min(*estimated_remaining_data_length, chunk_space_remaining_);
+ RETURN_NOT_OK(builder_->ReserveData(required_capacity));
+ }
+ return Status::OK();
+ }
+
bool CanFit(int64_t length) const { return length <= chunk_space_remaining_; }
Accumulator* acc_;
@@ -754,43 +756,50 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
int64_t valid_bits_offset,
typename EncodingTraits<ByteArrayType>::Accumulator* out,
int* out_values_decoded) {
- // We're going to decode up to `num_values - null_count` PLAIN values,
+ // We're going to decode `num_values - null_count` PLAIN values,
// and each value has a 4-byte length header that doesn't count for the
// Arrow binary data length.
- int64_t estimated_data_length =
- std::max<int64_t>(0, len_ - 4 * (num_values - null_count));
+ int64_t estimated_data_length = len_ - 4 * (num_values - null_count);
+ if (ARROW_PREDICT_FALSE(estimated_data_length < 0)) {
+ return Status::Invalid("Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
+ }
auto visit_binary_helper = [&](auto* helper) {
int values_decoded = 0;
- RETURN_NOT_OK(VisitBitRuns(
- valid_bits, valid_bits_offset, num_values,
- [&](int64_t position, int64_t run_length, bool is_valid) {
- if (is_valid) {
- for (int64_t i = 0; i < run_length; ++i) {
- if (ARROW_PREDICT_FALSE(len_ < 4)) {
- return Status::Invalid(
- "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
- }
- auto value_len = SafeLoadAs<int32_t>(data_);
- if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) {
- return Status::Invalid(
- "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
- }
- RETURN_NOT_OK(
- helper->AppendValue(data_ + 4, value_len, estimated_data_length));
- auto increment = value_len + 4;
- data_ += increment;
- len_ -= increment;
- estimated_data_length -= value_len;
- DCHECK_GE(estimated_data_length, 0);
- }
- values_decoded += static_cast<int>(run_length);
- return Status::OK();
- } else {
- return helper->AppendNulls(run_length);
+ auto visit_run = [&](int64_t position, int64_t run_length, bool is_valid) {
+ if (is_valid) {
+ for (int64_t i = 0; i < run_length; ++i) {
+ // We ensure `len_` is sufficient thanks to:
+ // 1. the initial `estimated_data_length` check above,
+ // 2. the running `value_len > estimated_data_length` check below.
+ // This precondition follows from those two checks.
+ DCHECK_GE(len_, 4);
+ auto value_len = SafeLoadAs<int32_t>(data_);
+ // This check also ensures that `value_len <= len_ - 4` due to the way
+ // `estimated_data_length` is computed.
+ if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > estimated_data_length)) {
+ return Status::Invalid(
+ "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
}
- }));
+ RETURN_NOT_OK(
+ helper->AppendValue(data_ + 4, value_len, estimated_data_length));
+ auto increment = value_len + 4;
+ data_ += increment;
+ len_ -= increment;
+ estimated_data_length -= value_len;
+ DCHECK_GE(estimated_data_length, 0);
+ }
+ values_decoded += static_cast<int>(run_length);
+ DCHECK_LE(values_decoded, num_values);
+ return Status::OK();
+ } else {
+ return helper->AppendNulls(run_length);
+ }
+ };
+
+ RETURN_NOT_OK(
+ VisitBitRuns(valid_bits, valid_bits_offset, num_values, std::move(visit_run)));
num_values_ -= values_decoded;
*out_values_decoded = values_decoded;
diff --git a/testing b/testing
index 047d391459..da559b5412 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 047d3914590d8379b5da19c4f5b0d1869a8ecdb3
+Subproject commit da559b5412551c490a97c67f6779fcefd4352aff