This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f40bf77517 GH-37453: [C++][Parquet] Performance fix for WriteBatch (#37454)
f40bf77517 is described below
commit f40bf7751772276134c46cae7ac25cef2cdad560
Author: Adam Reeve <[email protected]>
AuthorDate: Wed Aug 30 21:57:24 2023 +1200
GH-37453: [C++][Parquet] Performance fix for WriteBatch (#37454)
### Rationale for this change
Reduces the time taken for `TypedColumnWriter::WriteBatch`, which regressed
in #35230.
### What changes are included in this PR?
This change computes the value for `pages_change_on_record_boundaries` once
when a `TypedColumnWriter` is constructed rather than on every call to
`WriteBatch`.
### Are these changes tested?
This doesn't change behaviour, so it should be covered by existing tests.
### Are there any user-facing changes?
No
* Closes: #37453
Authored-by: Adam Reeve <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/column_writer.cc | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index e34420b9f6..3fca5542a0 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1219,6 +1219,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
public TypedColumnWriter<
page_statistics_ = MakeStatistics<DType>(descr_, allocator_);
chunk_statistics_ = MakeStatistics<DType>(descr_, allocator_);
}
+ pages_change_on_record_boundaries_ =
+ properties->data_page_version() == ParquetDataPageVersion::V2 ||
+ properties->page_index_enabled(descr_->path());
}
int64_t Close() override { return ColumnWriterImpl::Close(); }
@@ -1386,8 +1389,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
public TypedColumnWriter<
const WriterProperties* properties() override { return properties_; }
bool pages_change_on_record_boundaries() const {
- return properties_->data_page_version() == ParquetDataPageVersion::V2 ||
- properties_->page_index_enabled(descr_->path());
+ return pages_change_on_record_boundaries_;
}
private:
@@ -1402,6 +1404,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
public TypedColumnWriter<
DictEncoder<DType>* current_dict_encoder_;
std::shared_ptr<TypedStats> page_statistics_;
std::shared_ptr<TypedStats> chunk_statistics_;
+ bool pages_change_on_record_boundaries_;
// If writing a sequence of ::arrow::DictionaryArray to the writer, we keep
the
// dictionary passed to DictEncoder<T>::PutDictionary so we can check