This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 60708150f1 GH-38432: [C++][Parquet] Try to fix performance regression 
in the DictByteArrayDecoderImpl (#38784)
60708150f1 is described below

commit 60708150f1e668ccf51302959921d3a9977d7118
Author: mwish <[email protected]>
AuthorDate: Tue Nov 28 00:50:54 2023 +0800

    GH-38432: [C++][Parquet] Try to fix performance regression in the 
DictByteArrayDecoderImpl (#38784)
    
    
    
    ### Rationale for this change
    
    Do some changes mentioned in https://github.com/apache/arrow/issues/38432
    
    I believe this might fix https://github.com/apache/arrow/issues/38577
    
    Problem1:
    
    The `BinaryHelper` might call `Prepare()` and 
`Prepare(estimated-output-binary-length)` for data. This might because:
    
    1. For Plain Encoding ByteArray, the `len_` is similar to the data-page 
size, so `Reserve` is related.
    2. For Dict Encoding. The Data Page is just a RLE encoding Page, it's 
`len_` might didn't directly related to output-binary.
    
    Problem2:
    
    `Prepare` using `::arrow::kBinaryMemoryLimit` as min-value, we should use 
`this->chunk_space_remaining_`.
    
    Problem3:
    
    `std::optional<int64_t>` is hard to optimize for some compilers
    
    ### What changes are included in this PR?
    
    Mention the behavior of BinaryHelper. And trying to fix it.
    
    ### Are these changes tested?
    
    No
    
    ### Are there any user-facing changes?
    
    Regression fixes
    
    * Closes: #38432
    
    Lead-authored-by: mwish <[email protected]>
    Co-authored-by: mwish <[email protected]>
    Co-authored-by: Gang Wu <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/encoding.cc | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 1bb487c20d..04c6935d2d 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -1196,24 +1196,40 @@ struct ArrowBinaryHelper<ByteArrayType> {
         chunk_space_remaining_(::arrow::kBinaryMemoryLimit -
                                acc_->builder->value_data_length()) {}
 
+  // Prepare will reserve the number of entries remaining in the current chunk.
+  // If estimated_data_length is provided, it will also reserve the estimated 
data length,
+  // and the caller should better call `UnsafeAppend` instead of `Append` to 
avoid
+  // double-checking the data length.
   Status Prepare(std::optional<int64_t> estimated_data_length = {}) {
     RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_));
     if (estimated_data_length.has_value()) {
       RETURN_NOT_OK(acc_->builder->ReserveData(
-          std::min<int64_t>(*estimated_data_length, 
::arrow::kBinaryMemoryLimit)));
+          std::min<int64_t>(*estimated_data_length, 
this->chunk_space_remaining_)));
     }
     return Status::OK();
   }
 
+  Status PrepareNextInput(int64_t next_value_length) {
+    if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) {
+      // This element would exceed the capacity of a chunk
+      RETURN_NOT_OK(PushChunk());
+      RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_));
+    }
+    return Status::OK();
+  }
+
+  // If estimated_remaining_data_length is provided, it will also reserve the 
estimated
+  // data length, and the caller should better call `UnsafeAppend` instead of
+  // `Append` to avoid double-checking the data length.
   Status PrepareNextInput(int64_t next_value_length,
-                          std::optional<int64_t> 
estimated_remaining_data_length = {}) {
+                          int64_t estimated_remaining_data_length) {
     if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) {
       // This element would exceed the capacity of a chunk
       RETURN_NOT_OK(PushChunk());
       RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_));
-      if (estimated_remaining_data_length.has_value()) {
+      if (estimated_remaining_data_length) {
         RETURN_NOT_OK(acc_->builder->ReserveData(
-            std::min<int64_t>(*estimated_remaining_data_length, 
chunk_space_remaining_)));
+            std::min<int64_t>(estimated_remaining_data_length, 
chunk_space_remaining_)));
       }
     }
     return Status::OK();
@@ -1271,8 +1287,10 @@ struct ArrowBinaryHelper<FLBAType> {
     return acc_->Reserve(entries_remaining_);
   }
 
+  Status PrepareNextInput(int64_t next_value_length) { return Status::OK(); }
+
   Status PrepareNextInput(int64_t next_value_length,
-                          std::optional<int64_t> 
estimated_remaining_data_length = {}) {
+                          int64_t estimated_remaining_data_length) {
     return Status::OK();
   }
 
@@ -1915,6 +1933,9 @@ class DictByteArrayDecoderImpl : public 
DictDecoderImpl<ByteArrayType>,
     int32_t indices[kBufferSize];
 
     ArrowBinaryHelper<ByteArrayType> helper(out, num_values);
+    // The `len_` in the ByteArrayDictDecoder is the total length of the
+    // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve
+    // space for binary data.
     RETURN_NOT_OK(helper.Prepare());
 
     auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
@@ -1983,7 +2004,10 @@ class DictByteArrayDecoderImpl : public 
DictDecoderImpl<ByteArrayType>,
     int values_decoded = 0;
 
     ArrowBinaryHelper<ByteArrayType> helper(out, num_values);
-    RETURN_NOT_OK(helper.Prepare(len_));
+    // The `len_` in the ByteArrayDictDecoder is the total length of the
+    // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve
+    // space for binary data.
+    RETURN_NOT_OK(helper.Prepare());
 
     auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
 

Reply via email to