This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new b31977fd22 GH-36773: [C++][Parquet] Avoid calculating prebuffer column 
bitmap multiple times (#36774)
b31977fd22 is described below

commit b31977fd22afdb44cc8344f7814d1f6cd507a964
Author: mwish <[email protected]>
AuthorDate: Mon Jul 24 23:55:55 2023 +0800

    GH-36773: [C++][Parquet] Avoid calculating prebuffer column bitmap multiple 
times (#36774)
    
    
    
    ### Rationale for this change
    
    Following https://github.com/apache/arrow/pull/36192 and 
https://github.com/apache/arrow/pull/36649, RowGroupReader uses a bitmap to 
control column-level prebuffering.
    
    However, if all columns are selected, building that bitmap multiple times 
(once per row group) adds heavy overhead.
    
    ### What changes are included in this PR?
    
    Build the prebuffer bitmap once and share that buffer across all row groups.
    
    ### Are these changes tested?
    
    no
    
    ### Are there any user-facing changes?
    
    no
    
    * Closes: #36773
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/file_reader.cc | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index adda9a027b..08d493b0bc 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -179,17 +179,17 @@ class SerializedRowGroup : public 
RowGroupReader::Contents {
   SerializedRowGroup(std::shared_ptr<ArrowInputFile> source,
                      std::shared_ptr<::arrow::io::internal::ReadRangeCache> 
cached_source,
                      int64_t source_size, FileMetaData* file_metadata,
-                     int row_group_number, const ReaderProperties& props,
+                     int row_group_number, ReaderProperties props,
                      std::shared_ptr<Buffer> prebuffered_column_chunks_bitmap,
                      std::shared_ptr<InternalFileDecryptor> file_decryptor = 
nullptr)
       : source_(std::move(source)),
         cached_source_(std::move(cached_source)),
         source_size_(source_size),
         file_metadata_(file_metadata),
-        properties_(props),
+        properties_(std::move(props)),
         row_group_ordinal_(row_group_number),
         
prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)),
-        file_decryptor_(file_decryptor) {
+        file_decryptor_(std::move(file_decryptor)) {
     row_group_metadata_ = file_metadata->RowGroup(row_group_number);
   }
 
@@ -273,7 +273,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
   std::unique_ptr<RowGroupMetaData> row_group_metadata_;
   ReaderProperties properties_;
   int row_group_ordinal_;
-  const std::shared_ptr<Buffer> prebuffered_column_chunks_bitmap_;
+  const std::shared_ptr<const Buffer> prebuffered_column_chunks_bitmap_;
   std::shared_ptr<InternalFileDecryptor> file_decryptor_;
 };
 
@@ -366,13 +366,19 @@ class SerializedFile : public ParquetFileReader::Contents 
{
         std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx, 
options);
     std::vector<::arrow::io::ReadRange> ranges;
     prebuffered_column_chunks_.clear();
+    int num_cols = file_metadata_->num_columns();
+    // a bitmap for buffered columns.
+    std::shared_ptr<Buffer> buffer_columns;
+    if (!row_groups.empty()) {
+      PARQUET_THROW_NOT_OK(AllocateEmptyBitmap(num_cols, 
properties_.memory_pool())
+                               .Value(&buffer_columns));
+      for (int col : column_indices) {
+        ::arrow::bit_util::SetBit(buffer_columns->mutable_data(), col);
+      }
+    }
     for (int row : row_groups) {
-      std::shared_ptr<Buffer>& col_bitmap = prebuffered_column_chunks_[row];
-      int num_cols = file_metadata_->num_columns();
-      PARQUET_THROW_NOT_OK(
-          AllocateEmptyBitmap(num_cols, 
properties_.memory_pool()).Value(&col_bitmap));
+      prebuffered_column_chunks_[row] = buffer_columns;
       for (int col : column_indices) {
-        ::arrow::bit_util::SetBit(col_bitmap->mutable_data(), col);
         ranges.push_back(
             ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, 
col));
       }

Reply via email to