pitrou commented on code in PR #46532: URL: https://github.com/apache/arrow/pull/46532#discussion_r2108326403
########## cpp/src/parquet/decoder.cc: ########## @@ -654,43 +701,41 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>, int64_t valid_bits_offset, typename EncodingTraits<ByteArrayType>::Accumulator* out, int* out_values_decoded) { - ArrowBinaryHelper<ByteArrayType> helper(out, num_values); - int values_decoded = 0; - - RETURN_NOT_OK(helper.Prepare(len_)); - - int i = 0; - RETURN_NOT_OK(VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { - ParquetException::EofException(); - } - auto value_len = SafeLoadAs<int32_t>(data_); - if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { - return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); - } - auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { - ParquetException::EofException(); - } - RETURN_NOT_OK(helper.PrepareNextInput(value_len, len_)); - helper.UnsafeAppend(data_ + 4, value_len); - data_ += increment; - len_ -= increment; - ++values_decoded; - ++i; - return Status::OK(); - }, - [&]() { - helper.UnsafeAppendNull(); - ++i; - return Status::OK(); - })); + auto visit_binary_helper = [&](auto* helper) { + int values_decoded = 0; + + RETURN_NOT_OK(VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { + if (ARROW_PREDICT_FALSE(len_ < 4)) { + return Status::Invalid( + "Invalid or truncated PLAIN-encoded BYTE_ARRAY data"); + } + auto value_len = SafeLoadAs<int32_t>(data_); + if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) { + return Status::Invalid( + "Invalid or truncated PLAIN-encoded BYTE_ARRAY data"); + } + RETURN_NOT_OK(helper->AppendValue(data_ + 4, value_len, + /*estimated_remaining_data_length=*/len_)); Review Comment: Yes, the API could look weird. You have to take a look at the `AppendValue` definition so that it makes sense: the `len_` is only used when a new chunk is started, to reserve the data on the new builder. ########## cpp/src/parquet/decoder.cc: ########## @@ -654,43 +701,41 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>, int64_t valid_bits_offset, typename EncodingTraits<ByteArrayType>::Accumulator* out, int* out_values_decoded) { - ArrowBinaryHelper<ByteArrayType> helper(out, num_values); - int values_decoded = 0; - - RETURN_NOT_OK(helper.Prepare(len_)); - - int i = 0; - RETURN_NOT_OK(VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { - ParquetException::EofException(); - } - auto value_len = SafeLoadAs<int32_t>(data_); - if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { - return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); - } - auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { - ParquetException::EofException(); - } - RETURN_NOT_OK(helper.PrepareNextInput(value_len, len_)); - helper.UnsafeAppend(data_ + 4, value_len); - data_ += increment; - len_ -= increment; - ++values_decoded; - ++i; - return Status::OK(); - }, - [&]() { - helper.UnsafeAppendNull(); - ++i; - return Status::OK(); - })); + auto visit_binary_helper = [&](auto* helper) { + int values_decoded = 0; + + RETURN_NOT_OK(VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { + if (ARROW_PREDICT_FALSE(len_ < 4)) { + return Status::Invalid( + "Invalid or truncated PLAIN-encoded BYTE_ARRAY data"); + } + auto value_len = SafeLoadAs<int32_t>(data_); + if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) { + return Status::Invalid( + "Invalid or truncated PLAIN-encoded BYTE_ARRAY data"); + } + RETURN_NOT_OK(helper->AppendValue(data_ + 4, value_len, + /*estimated_remaining_data_length=*/len_)); Review Comment: Yes, the API can look weird. You have to take a look at the `AppendValue` definition so that it makes sense: the `len_` is only used when a new chunk is started, to reserve the data on the new builder. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org