This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-16.x.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 6cfebb947e9c91b3a823e5dc7a1672d1b0a0141e
Author: mwish <[email protected]>
AuthorDate: Tue May 7 23:56:02 2024 +0800

    GH-41562: [C++][Parquet] Decoding: Fix num_value handling in 
ByteStreamSplitDecoder (#41565)
    
    
    
    ### Rationale for this change
    
    This problem is raised from  https://github.com/apache/arrow/pull/40094 . 
Original bug fixed here: https://github.com/apache/arrow/pull/34140 , but this 
is corrupt in https://github.com/apache/arrow/pull/40094 .
    
    ### What changes are included in this PR?
    
    Refine checking
    
    ### Are these changes tested?
    
    * [x] Will add
    
    ### Are there any user-facing changes?
    
    Bugfix
    
    * GitHub Issue: #41562
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/encoding.cc      | 22 +++++++++++++++++-----
 cpp/src/parquet/encoding.h       |  5 +++++
 cpp/src/parquet/encoding_test.cc |  4 ++--
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 3da5c64ace..05221568c8 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -3694,12 +3694,24 @@ class ByteStreamSplitDecoderBase : public DecoderImpl,
   ByteStreamSplitDecoderBase(const ColumnDescriptor* descr, int byte_width)
       : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT), 
byte_width_(byte_width) {}
 
-  void SetData(int num_values, const uint8_t* data, int len) override {
-    if (static_cast<int64_t>(num_values) * byte_width_ != len) {
-      throw ParquetException("Data size (" + std::to_string(len) +
-                             ") does not match number of values in 
BYTE_STREAM_SPLIT (" +
-                             std::to_string(num_values) + ")");
+  void SetData(int num_values, const uint8_t* data, int len) final {
+    // Check that the data size is consistent with the number of values
+    // The spec requires that the data size is a multiple of the number of 
values,
+    // see: https://github.com/apache/parquet-format/pull/192 .
+    // GH-41562: passed in `num_values` may include nulls, so we need to check 
and
+    // adjust the number of values.
+    if (static_cast<int64_t>(num_values) * byte_width_ < len) {
+      throw ParquetException(
+          "Data size (" + std::to_string(len) +
+          ") is too small for the number of values in in BYTE_STREAM_SPLIT (" +
+          std::to_string(num_values) + ")");
+    }
+    if (len % byte_width_ != 0) {
+      throw ParquetException("ByteStreamSplit data size " + 
std::to_string(len) +
+                             " not aligned with type " + 
TypeToString(DType::type_num) +
+                             " and byte_width: " + 
std::to_string(byte_width_));
     }
+    num_values = len / byte_width_;
     DecoderImpl::SetData(num_values, data, len);
     stride_ = num_values_;
   }
diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h
index 6020091895..493c4044dd 100644
--- a/cpp/src/parquet/encoding.h
+++ b/cpp/src/parquet/encoding.h
@@ -255,6 +255,11 @@ class Decoder {
 
   // Sets the data for a new page. This will be called multiple times on the 
same
   // decoder and should reset all internal state.
+  //
+  // `num_values` comes from the data page header, and may be greater than the 
number of
+  // physical values in the data buffer if there are some omitted (null) 
values.
+  // `len`, on the other hand, is the size in bytes of the data buffer and
+  // directly relates to the number of physical values.
   virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
 
   // Returns the number of values left (for the last call to SetData()). This 
is
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
index b91fcb0839..3c20b917f6 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -1383,7 +1383,7 @@ class TestByteStreamSplitEncoding : public 
TestEncodingBase<Type> {
     encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
     encode_buffer_ = encoder->FlushValues();
     ASSERT_EQ(encode_buffer_->size(), physical_byte_width() * (num_values_ - 
null_count));
-    decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
+    decoder->SetData(num_values_, encode_buffer_->data(),
                      static_cast<int>(encode_buffer_->size()));
     auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, 
null_count,
                                                 valid_bits, valid_bits_offset);
@@ -1717,7 +1717,7 @@ class TestDeltaBitPackEncoding : public 
TestEncodingBase<Type> {
     for (size_t i = 0; i < kNumRoundTrips; ++i) {
       encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
       encode_buffer_ = encoder->FlushValues();
-      decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
+      decoder->SetData(num_values_, encode_buffer_->data(),
                        static_cast<int>(encode_buffer_->size()));
       auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, 
null_count,
                                                   valid_bits, 
valid_bits_offset);

Reply via email to