This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b31977fd22 GH-36773: [C++][Parquet] Avoid calculating prebuffer column
bitmap multiple times (#36774)
b31977fd22 is described below
commit b31977fd22afdb44cc8344f7814d1f6cd507a964
Author: mwish <[email protected]>
AuthorDate: Mon Jul 24 23:55:55 2023 +0800
GH-36773: [C++][Parquet] Avoid calculating prebuffer column bitmap multiple
times (#36774)
### Rationale for this change
Since https://github.com/apache/arrow/pull/36192 and
https://github.com/apache/arrow/pull/36649, RowGroupReader uses a bitmap to
control column-level prebuffering.
However, if all columns are selected, rebuilding the same bitmap for every
row group is a heavy overhead.
### What changes are included in this PR?
Build the `Prebuffer` bitmap once, and reuse that buffer for every row group.
### Are these changes tested?
no
### Are there any user-facing changes?
no
* Closes: #36773
Authored-by: mwish <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/file_reader.cc | 24 +++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index adda9a027b..08d493b0bc 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -179,17 +179,17 @@ class SerializedRowGroup : public
RowGroupReader::Contents {
SerializedRowGroup(std::shared_ptr<ArrowInputFile> source,
std::shared_ptr<::arrow::io::internal::ReadRangeCache>
cached_source,
int64_t source_size, FileMetaData* file_metadata,
- int row_group_number, const ReaderProperties& props,
+ int row_group_number, ReaderProperties props,
std::shared_ptr<Buffer> prebuffered_column_chunks_bitmap,
std::shared_ptr<InternalFileDecryptor> file_decryptor =
nullptr)
: source_(std::move(source)),
cached_source_(std::move(cached_source)),
source_size_(source_size),
file_metadata_(file_metadata),
- properties_(props),
+ properties_(std::move(props)),
row_group_ordinal_(row_group_number),
prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)),
- file_decryptor_(file_decryptor) {
+ file_decryptor_(std::move(file_decryptor)) {
row_group_metadata_ = file_metadata->RowGroup(row_group_number);
}
@@ -273,7 +273,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
std::unique_ptr<RowGroupMetaData> row_group_metadata_;
ReaderProperties properties_;
int row_group_ordinal_;
- const std::shared_ptr<Buffer> prebuffered_column_chunks_bitmap_;
+ const std::shared_ptr<const Buffer> prebuffered_column_chunks_bitmap_;
std::shared_ptr<InternalFileDecryptor> file_decryptor_;
};
@@ -366,13 +366,19 @@ class SerializedFile : public ParquetFileReader::Contents
{
std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx,
options);
std::vector<::arrow::io::ReadRange> ranges;
prebuffered_column_chunks_.clear();
+ int num_cols = file_metadata_->num_columns();
+ // a bitmap for buffered columns.
+ std::shared_ptr<Buffer> buffer_columns;
+ if (!row_groups.empty()) {
+ PARQUET_THROW_NOT_OK(AllocateEmptyBitmap(num_cols,
properties_.memory_pool())
+ .Value(&buffer_columns));
+ for (int col : column_indices) {
+ ::arrow::bit_util::SetBit(buffer_columns->mutable_data(), col);
+ }
+ }
for (int row : row_groups) {
- std::shared_ptr<Buffer>& col_bitmap = prebuffered_column_chunks_[row];
- int num_cols = file_metadata_->num_columns();
- PARQUET_THROW_NOT_OK(
- AllocateEmptyBitmap(num_cols,
properties_.memory_pool()).Value(&col_bitmap));
+ prebuffered_column_chunks_[row] = buffer_columns;
for (int col : column_indices) {
- ::arrow::bit_util::SetBit(col_bitmap->mutable_data(), col);
ranges.push_back(
ComputeColumnChunkRange(file_metadata_.get(), source_size_, row,
col));
}