pitrou commented on code in PR #48309:
URL: https://github.com/apache/arrow/pull/48309#discussion_r2582171832
##########
cpp/src/parquet/decoder.cc:
##########
@@ -754,43 +760,47 @@ class PlainByteArrayDecoder : public
PlainDecoder<ByteArrayType> {
int64_t valid_bits_offset,
typename EncodingTraits<ByteArrayType>::Accumulator*
out,
int* out_values_decoded) {
- // We're going to decode up to `num_values - null_count` PLAIN values,
+ // We're going to decode `num_values - null_count` PLAIN values,
// and each value has a 4-byte length header that doesn't count for the
// Arrow binary data length.
- int64_t estimated_data_length =
- std::max<int64_t>(0, len_ - 4 * (num_values - null_count));
+ int64_t estimated_data_length = len_ - 4 * (num_values - null_count);
+ if (ARROW_PREDICT_FALSE(estimated_data_length < 0)) {
+ return Status::Invalid("Invalid or truncated PLAIN-encoded BYTE_ARRAY
data");
+ }
auto visit_binary_helper = [&](auto* helper) {
int values_decoded = 0;
- RETURN_NOT_OK(VisitBitRuns(
- valid_bits, valid_bits_offset, num_values,
- [&](int64_t position, int64_t run_length, bool is_valid) {
- if (is_valid) {
- for (int64_t i = 0; i < run_length; ++i) {
- if (ARROW_PREDICT_FALSE(len_ < 4)) {
- return Status::Invalid(
- "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
- }
- auto value_len = SafeLoadAs<int32_t>(data_);
- if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ -
4)) {
- return Status::Invalid(
- "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
- }
- RETURN_NOT_OK(
- helper->AppendValue(data_ + 4, value_len,
estimated_data_length));
- auto increment = value_len + 4;
- data_ += increment;
- len_ -= increment;
- estimated_data_length -= value_len;
- DCHECK_GE(estimated_data_length, 0);
- }
- values_decoded += static_cast<int>(run_length);
- return Status::OK();
- } else {
- return helper->AppendNulls(run_length);
+ auto visit_run = [&](int64_t position, int64_t run_length, bool
is_valid) {
+ if (is_valid) {
+ for (int64_t i = 0; i < run_length; ++i) {
+ // We ensure `len_` is sufficient thanks to:
+ // 1. the initial `estimated_data_length` check above,
+ // 2. the running `value_len > estimated_data_length` check
below.
+ // This precondition follows from those two checks.
+ DCHECK_GE(len_, 4);
+ auto value_len = SafeLoadAs<int32_t>(data_);
+ if (ARROW_PREDICT_FALSE(value_len < 0 || value_len >
estimated_data_length)) {
Review Comment:
This is the main logical change in this code block: `value_len > len_ - 4`
is replaced with the stricter `value_len > estimated_data_length`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]