This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 51689a040c GH-41545: [C++][Parquet] Fix 
DeltaLengthByteArrayEncoder::EstimatedDataEncodedSize (#41546)
51689a040c is described below

commit 51689a040cbe3dee8702cd899a33fa62e0616bf1
Author: mwish <[email protected]>
AuthorDate: Wed May 8 00:14:22 2024 +0800

    GH-41545: [C++][Parquet] Fix 
DeltaLengthByteArrayEncoder::EstimatedDataEncodedSize (#41546)
    
    
    
    ### Rationale for this change
    
    `DeltaLengthByteArrayEncoder::EstimatedDataEncodedSize` would return a 
wrong estimate when `Put(const Array&)` was called.
    
    ### What changes are included in this PR?
    
    Remove `encoded_size_` and use `sink_.length()` as `encoded_size_`.
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No
    
    * GitHub Issue: #41545
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/encoding.cc      | 18 ++++++++++--------
 cpp/src/parquet/encoding_test.cc |  9 +++++++++
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 05221568c8..004cb746b3 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2740,13 +2740,12 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl,
       : EncoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY,
                     pool = ::arrow::default_memory_pool()),
         sink_(pool),
-        length_encoder_(nullptr, pool),
-        encoded_size_{0} {}
+        length_encoder_(nullptr, pool) {}
 
   std::shared_ptr<Buffer> FlushValues() override;
 
   int64_t EstimatedDataEncodedSize() override {
-    return encoded_size_ + length_encoder_.EstimatedDataEncodedSize();
+    return sink_.length() + length_encoder_.EstimatedDataEncodedSize();
   }
 
   using TypedEncoder<ByteArrayType>::Put;
@@ -2768,6 +2767,11 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl,
             return Status::Invalid(
                 "Parquet cannot store strings with size 2GB or more, got: ", 
view.size());
           }
+          if (ARROW_PREDICT_FALSE(
+                  view.size() + sink_.length() >
+                  static_cast<size_t>(std::numeric_limits<int32_t>::max()))) {
+            return Status::Invalid("excess expansion in 
DELTA_LENGTH_BYTE_ARRAY");
+          }
           length_encoder_.Put({static_cast<int32_t>(view.length())}, 1);
           PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length()));
           return Status::OK();
@@ -2777,7 +2781,6 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl,
 
   ::arrow::BufferBuilder sink_;
   DeltaBitPackEncoder<Int32Type> length_encoder_;
-  uint32_t encoded_size_;
 };
 
 template <typename DType>
@@ -2803,15 +2806,15 @@ void DeltaLengthByteArrayEncoder<DType>::Put(const T* 
src, int num_values) {
     const int batch_size = std::min(kBatchSize, num_values - idx);
     for (int j = 0; j < batch_size; ++j) {
       const int32_t len = src[idx + j].len;
-      if (AddWithOverflow(total_increment_size, len, &total_increment_size)) {
+      if (ARROW_PREDICT_FALSE(
+              AddWithOverflow(total_increment_size, len, 
&total_increment_size))) {
         throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY");
       }
       lengths[j] = len;
     }
     length_encoder_.Put(lengths.data(), batch_size);
   }
-
-  if (AddWithOverflow(encoded_size_, total_increment_size, &encoded_size_)) {
+  if (sink_.length() + total_increment_size > 
std::numeric_limits<int32_t>::max()) {
     throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY");
   }
   PARQUET_THROW_NOT_OK(sink_.Reserve(total_increment_size));
@@ -2850,7 +2853,6 @@ std::shared_ptr<Buffer> 
DeltaLengthByteArrayEncoder<DType>::FlushValues() {
 
   std::shared_ptr<Buffer> buffer;
   PARQUET_THROW_NOT_OK(sink_.Finish(&buffer, true));
-  encoded_size_ = 0;
   return buffer;
 }
 
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
index 3c20b917f6..78bf26587e 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -577,6 +577,11 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) {
     auto decoder = MakeTypedDecoder<ByteArrayType>(Encoding::PLAIN);
 
     ASSERT_NO_THROW(encoder->Put(*values));
+    // For Plain encoding, the estimated size should be at least the total 
byte size
+    auto& string_array = dynamic_cast<const ::arrow::StringArray&>(*values);
+    EXPECT_GE(encoder->EstimatedDataEncodedSize(), 
string_array.total_values_length())
+        << "Estimated size should be at least the total byte size";
+
     auto buf = encoder->FlushValues();
 
     int num_values = static_cast<int>(values->length() - values->null_count());
@@ -2160,6 +2165,10 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, 
ArrowBinaryDirectPut) {
 
   auto CheckSeed = [&](std::shared_ptr<::arrow::Array> values) {
     ASSERT_NO_THROW(encoder->Put(*values));
+    auto* binary_array = checked_cast<const 
::arrow::BinaryArray*>(values.get());
+    // For DeltaLength encoding, the estimated size should be at least the 
total byte size
+    EXPECT_GE(encoder->EstimatedDataEncodedSize(), 
binary_array->total_values_length())
+        << "Estimated size should be at least the total byte size";
     auto buf = encoder->FlushValues();
 
     int num_values = static_cast<int>(values->length() - values->null_count());

Reply via email to