[arrow] branch main updated: GH-37939: [C++] Use signed arithmetic for frame of reference when encoding DELTA_BINARY_PACKED (#37940)

apitrou Tue, 03 Oct 2023 12:15:07 -0700

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 5514b223eb GH-37939: [C++] Use signed arithmetic for frame of 
reference when encoding DELTA_BINARY_PACKED (#37940)
5514b223eb is described below

commit 5514b223ebc1345c7bbd501fe3cc57dcac668e86
Author: Ed Seidl <[email protected]>
AuthorDate: Tue Oct 3 12:14:50 2023 -0700

    GH-37939: [C++] Use signed arithmetic for frame of reference when encoding 
DELTA_BINARY_PACKED (#37940)
    
    Closes #37939.
    
    ### What changes are included in this PR?
    
    This PR changes values used in the `DELTA_BINARY_PACKED` encoder to signed 
types. To gracefully handle overflow, arithmetic is still performed in the 
unsigned domain, but other operations such as computing the min and max deltas 
are done in the signed domain.
    
    Using signed types ensures the optimal number of bits is used when encoding 
the deltas, which was not the case before if any negative deltas were 
encountered (which is obviously common).
    
    ### Are these changes tested?
    I've included two tests that result in overflow.
    
    ### Are there any user-facing changes?
    No
    
    * Closes: #37939
    
    Authored-by: seidl <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/encoding.cc      | 27 +++++++++--------
 cpp/src/parquet/encoding_test.cc | 64 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 9f705768e4..931b9fd107 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -57,6 +57,7 @@ using arrow::VisitNullBitmapInline;
 using arrow::internal::AddWithOverflow;
 using arrow::internal::checked_cast;
 using arrow::internal::MultiplyWithOverflow;
+using arrow::internal::SafeSignedSubtract;
 using arrow::internal::SubtractWithOverflow;
 using arrow::util::SafeLoad;
 using arrow::util::SafeLoadAs;
@@ -2189,9 +2190,9 @@ class DeltaBitPackEncoder : public EncoderImpl, virtual 
public TypedEncoder<DTyp
   const uint32_t values_per_mini_block_;
   uint32_t values_current_block_{0};
   uint32_t total_value_count_{0};
-  UT first_value_{0};
-  UT current_value_{0};
-  ArrowPoolVector<UT> deltas_;
+  T first_value_{0};
+  T current_value_{0};
+  ArrowPoolVector<T> deltas_;
   std::shared_ptr<ResizableBuffer> bits_buffer_;
   ::arrow::BufferBuilder sink_;
   ::arrow::bit_util::BitWriter bit_writer_;
@@ -2212,12 +2213,12 @@ void DeltaBitPackEncoder<DType>::Put(const T* src, int 
num_values) {
   total_value_count_ += num_values;
 
   while (idx < num_values) {
-    UT value = static_cast<UT>(src[idx]);
+    T value = src[idx];
     // Calculate deltas. The possible overflow is handled by use of unsigned 
integers
     // making subtraction operations well-defined and correct even in case of 
overflow.
     // Encoded integers will wrap back around on decoding.
     // See http://en.wikipedia.org/wiki/Modular_arithmetic#Integers_modulo_n
-    deltas_[values_current_block_] = value - current_value_;
+    deltas_[values_current_block_] = SafeSignedSubtract(value, current_value_);
     current_value_ = value;
     idx++;
     values_current_block_++;
@@ -2233,9 +2234,11 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
     return;
   }
 
-  const UT min_delta =
+  // Calculate the frame of reference for this miniblock. This value will be 
subtracted
+  // from all deltas to guarantee all deltas are positive for encoding.
+  const T min_delta =
       *std::min_element(deltas_.begin(), deltas_.begin() + 
values_current_block_);
-  bit_writer_.PutZigZagVlqInt(static_cast<T>(min_delta));
+  bit_writer_.PutZigZagVlqInt(min_delta);
 
   // Call to GetNextBytePtr reserves mini_blocks_per_block_ bytes of space to 
write
   // bit widths of miniblocks as they become known during the encoding.
@@ -2250,17 +2253,17 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
         std::min(values_per_mini_block_, values_current_block_);
 
     const uint32_t start = i * values_per_mini_block_;
-    const UT max_delta = *std::max_element(
+    const T max_delta = *std::max_element(
         deltas_.begin() + start, deltas_.begin() + start + 
values_current_mini_block);
 
     // The minimum number of bits required to write any of values in deltas_ 
vector.
     // See overflow comment above.
-    const auto bit_width = bit_width_data[i] =
-        bit_util::NumRequiredBits(max_delta - min_delta);
+    const auto bit_width = bit_width_data[i] = bit_util::NumRequiredBits(
+        static_cast<UT>(max_delta) - static_cast<UT>(min_delta));
 
     for (uint32_t j = start; j < start + values_current_mini_block; j++) {
-      // See overflow comment above.
-      const UT value = deltas_[j] - min_delta;
+      // Convert delta to frame of reference. See overflow comment above.
+      const UT value = static_cast<UT>(deltas_[j]) - 
static_cast<UT>(min_delta);
       bit_writer_.PutValue(value, bit_width);
     }
     // If there are not enough values to fill the last mini block, we pad the 
mini block
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
index 71dc40d33a..9861c317c8 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -1634,6 +1634,70 @@ TYPED_TEST(TestDeltaBitPackEncoding, 
NonZeroPaddedMiniblockBitWidth) {
   }
 }
 
+// Test that the DELTA_BINARY_PACKED encoding works properply in the presence 
of values
+// that will cause integer overflow (see GH-37939).
+TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedWrapping) {
+  using T = typename TypeParam::c_type;
+
+  // Values that should wrap when converted to deltas, and then when converted 
to the
+  // frame of reference.
+  std::vector<T> int_values = {std::numeric_limits<T>::min(),
+                               std::numeric_limits<T>::max(),
+                               std::numeric_limits<T>::min(),
+                               std::numeric_limits<T>::max(),
+                               0,
+                               -1,
+                               0,
+                               1,
+                               -1,
+                               1};
+  const int num_values = static_cast<int>(int_values.size());
+
+  auto const encoder = MakeTypedEncoder<TypeParam>(
+      Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, 
this->descr_.get());
+  encoder->Put(int_values, num_values);
+  auto const encoded = encoder->FlushValues();
+
+  auto const decoder =
+      MakeTypedDecoder<TypeParam>(Encoding::DELTA_BINARY_PACKED, 
this->descr_.get());
+
+  std::vector<T> decoded(num_values);
+  decoder->SetData(num_values, encoded->data(), 
static_cast<int>(encoded->size()));
+
+  const int values_decoded = decoder->Decode(decoded.data(), num_values);
+
+  ASSERT_EQ(num_values, values_decoded);
+  ASSERT_NO_FATAL_FAILURE(
+      VerifyResults<T>(decoded.data(), int_values.data(), num_values));
+}
+
+// Test that the DELTA_BINARY_PACKED encoding does not use more bits to encode 
than
+// necessary (see GH-37939).
+TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedSize) {
+  using T = typename TypeParam::c_type;
+  constexpr int num_values = 128;
+  // 128 values should be <= 1 block of values encoded with 2 bits
+  // delta header should be 0x8001|0x8002 0x04 0x8001 0x02 (6 bytes)
+  // mini-block header should be 0x01 0x02020202 (5 bytes)
+  constexpr int encoded_size = 2 * num_values / 8 + 6 + 5;
+
+  // Create a run of {1, 0, -1, 0, 1, 0, ...}.
+  // min_delta is -1, max_delta is 1, max_delta - min_delta is 2, so this 
requires 2 bits
+  // to encode.
+  std::vector<T> int_values(num_values);
+  std::iota(int_values.begin(), int_values.end(), 0);
+  std::transform(int_values.begin(), int_values.end(), int_values.begin(), 
[](T idx) {
+    return (idx % 2) == 1 ? 0 : (idx % 4) == 0 ? 1 : -1;
+  });
+
+  auto const encoder = MakeTypedEncoder<TypeParam>(
+      Encoding::DELTA_BINARY_PACKED, /*use_dictionary=*/false, 
this->descr_.get());
+  encoder->Put(int_values, num_values);
+  auto const encoded = encoder->FlushValues();
+
+  ASSERT_EQ(encoded->size(), encoded_size);
+}
+
 // ----------------------------------------------------------------------
 // Rle for Boolean encode/decode tests.

[arrow] branch main updated: GH-37939: [C++] Use signed arithmetic for frame of reference when encoding DELTA_BINARY_PACKED (#37940)

Reply via email to