This is an automated email from the ASF dual-hosted git repository.

wjones127 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c31fb46544 GH-15173: [C++][Parquet] Fixing ByteStreamSplit Standard 
broken (#34140)
c31fb46544 is described below

commit c31fb46544b9c8372e799138bad9223162169473
Author: mwish <[email protected]>
AuthorDate: Sat Feb 18 00:08:19 2023 +0800

    GH-15173: [C++][Parquet] Fixing ByteStreamSplit Standard broken (#34140)
    
    
    
    ### Rationale for this change
    
    This patch fixes bugs in BYTE_STREAM_SPLIT. It enforce the parquet 
ByteStreamSplit not to use padding, and force using `sizeof(data) / sizeof(T)` 
as num-of-values.
    
    ### What changes are included in this PR?
    
    1. Update num-of-values in ByteStreamSplit
    2. Add testing
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No
    
    * Closes: #15173
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: Will Jones <[email protected]>
---
 cpp/src/parquet/encoding.cc      | 15 ++++++++++----
 cpp/src/parquet/encoding_test.cc | 45 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 3ed6d2f0b9..a47dc6b4c0 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2937,11 +2937,18 @@ 
ByteStreamSplitDecoder<DType>::ByteStreamSplitDecoder(const ColumnDescriptor* de
 template <typename DType>
 void ByteStreamSplitDecoder<DType>::SetData(int num_values, const uint8_t* 
data,
                                             int len) {
-  DecoderImpl::SetData(num_values, data, len);
-  if (num_values * static_cast<int64_t>(sizeof(T)) > len) {
-    throw ParquetException("Data size too small for number of values 
(corrupted file?)");
+  if (num_values * static_cast<int64_t>(sizeof(T)) < len) {
+    throw ParquetException(
+        "Data size too large for number of values (padding in byte stream 
split data "
+        "page?)");
+  }
+  if (len % sizeof(T) != 0) {
+    throw ParquetException("ByteStreamSplit data size " + std::to_string(len) +
+                           " not aligned with type " + 
TypeToString(DType::type_num));
   }
-  num_values_in_buffer_ = num_values;
+  num_values = len / sizeof(T);
+  DecoderImpl::SetData(num_values, data, len);
+  num_values_in_buffer_ = num_values_;
 }
 
 template <typename DType>
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
index ac3ea1118b..1500ba3c80 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -1125,6 +1125,29 @@ class TestByteStreamSplitEncoding : public 
TestEncodingBase<Type> {
     }
   }
 
+  void CheckRoundtripSpaced(const uint8_t* valid_bits,
+                            int64_t valid_bits_offset) override {
+    auto encoder =
+        MakeTypedEncoder<Type>(Encoding::BYTE_STREAM_SPLIT, false, 
descr_.get());
+    auto decoder = MakeTypedDecoder<Type>(Encoding::BYTE_STREAM_SPLIT, 
descr_.get());
+    int null_count = 0;
+    for (auto i = 0; i < num_values_; i++) {
+      if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) {
+        null_count++;
+      }
+    }
+
+    encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
+    encode_buffer_ = encoder->FlushValues();
+    decoder->SetData(num_values_, encode_buffer_->data(),
+                     static_cast<int>(encode_buffer_->size()));
+    auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, 
null_count,
+                                                valid_bits, valid_bits_offset);
+    ASSERT_EQ(num_values_, values_decoded);
+    ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, 
num_values_,
+                                                        valid_bits, 
valid_bits_offset));
+  }
+
   void CheckDecode();
   void CheckEncode();
 
@@ -1247,6 +1270,28 @@ TYPED_TEST(TestByteStreamSplitEncoding, 
RoundTripSingleElement) {
   ASSERT_NO_FATAL_FAILURE(this->Execute(1, 1));
 }
 
+TYPED_TEST(TestByteStreamSplitEncoding, RoundTripSpace) {
+  ASSERT_NO_FATAL_FAILURE(this->Execute(10000, 1));
+
+  // Spaced test with different sizes and offset to guarantee SIMD 
implementation
+  constexpr int kAvx512Size = 64;         // sizeof(__m512i) for Avx512
+  constexpr int kSimdSize = kAvx512Size;  // Current the max is Avx512
+  constexpr int kMultiSimdSize = kSimdSize * 33;
+
+  for (auto null_prob : {0.001, 0.1, 0.5, 0.9, 0.999}) {
+    // Test with both size and offset up to 3 Simd block
+    for (auto i = 1; i < kSimdSize * 3; i++) {
+      ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(i, 1, 0, null_prob));
+      ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(i, 1, i + 1, null_prob));
+    }
+    // Large block and offset
+    ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize, 1, 0, 
null_prob));
+    ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize + 33, 1, 0, 
null_prob));
+    ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize, 1, 33, 
null_prob));
+    ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize + 33, 1, 33, 
null_prob));
+  }
+}
+
 TYPED_TEST(TestByteStreamSplitEncoding, CheckOnlyDecode) {
   ASSERT_NO_FATAL_FAILURE(this->CheckDecode());
 }

Reply via email to