This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f40bf77517 GH-37453: [C++][Parquet] Performance fix for WriteBatch (#37454)
f40bf77517 is described below

commit f40bf7751772276134c46cae7ac25cef2cdad560
Author: Adam Reeve <[email protected]>
AuthorDate: Wed Aug 30 21:57:24 2023 +1200

    GH-37453: [C++][Parquet] Performance fix for WriteBatch (#37454)
    
    ### Rationale for this change
    
    Reduces the time taken for `TypedColumnWriter::WriteBatch`, which regressed with #35230
    
    ### What changes are included in this PR?
    
    This change computes the value for `pages_change_on_record_boundaries` once when a `TypedColumnWriter` is constructed rather than on every call to `WriteBatch`.
    
    ### Are these changes tested?
    
    This doesn't change behaviour so should be covered by existing tests.
    
    ### Are there any user-facing changes?
    
    No
    * Closes: #37453
    
    Authored-by: Adam Reeve <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/column_writer.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index e34420b9f6..3fca5542a0 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1219,6 +1219,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
       page_statistics_ = MakeStatistics<DType>(descr_, allocator_);
       chunk_statistics_ = MakeStatistics<DType>(descr_, allocator_);
     }
+    pages_change_on_record_boundaries_ =
+        properties->data_page_version() == ParquetDataPageVersion::V2 ||
+        properties->page_index_enabled(descr_->path());
   }
 
   int64_t Close() override { return ColumnWriterImpl::Close(); }
@@ -1386,8 +1389,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
   const WriterProperties* properties() override { return properties_; }
 
   bool pages_change_on_record_boundaries() const {
-    return properties_->data_page_version() == ParquetDataPageVersion::V2 ||
-           properties_->page_index_enabled(descr_->path());
+    return pages_change_on_record_boundaries_;
   }
 
  private:
@@ -1402,6 +1404,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
   DictEncoder<DType>* current_dict_encoder_;
   std::shared_ptr<TypedStats> page_statistics_;
   std::shared_ptr<TypedStats> chunk_statistics_;
+  bool pages_change_on_record_boundaries_;
 
   // If writing a sequence of ::arrow::DictionaryArray to the writer, we keep the
   // dictionary passed to DictEncoder<T>::PutDictionary so we can check

Reply via email to