This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 80db102088 GH-48467: [C++][Parquet] Add BufferedStats API to
RowGroupWriter (#49527)
80db102088 is described below
commit 80db1020881f461af3a300653cfd2333ef10a45e
Author: Wechar Yu <[email protected]>
AuthorDate: Fri Mar 27 22:34:16 2026 +0800
GH-48467: [C++][Parquet] Add BufferedStats API to RowGroupWriter (#49527)
### Rationale for this change
Expose an API in RowGroupWriter for the buffered bytes of values and levels;
it is useful when deciding whether a new row group is needed.
Discussion in
https://github.com/apache/arrow/pull/48468#issuecomment-4047682751
### What changes are included in this PR?
Add a new API to return `BufferedStats` from `RowGroupWriter`.
### Are these changes tested?
Tested locally.
### Are there any user-facing changes?
Yes, users can use the new API to implement their own customized row group
strategy.
* GitHub Issue: #48467
Lead-authored-by: wecharyu <[email protected]>
Co-authored-by: Gang Wu <[email protected]>
Co-authored-by: Zehua Zou <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/parquet/column_writer.cc | 15 +++++++++++++++
cpp/src/parquet/column_writer.h | 9 +++++++++
cpp/src/parquet/file_writer.cc | 20 ++++++++++++++++++++
cpp/src/parquet/file_writer.h | 15 +++++++++++++++
4 files changed, 59 insertions(+)
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 1f1197f95a..61507c2ba1 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1489,6 +1489,21 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
return current_encoder_->EstimatedDataEncodedSize();
}
+ int64_t estimated_buffered_def_level_bytes() const override {
+ return definition_levels_sink_.length();
+ }
+
+ int64_t estimated_buffered_rep_level_bytes() const override {
+ return repetition_levels_sink_.length();
+ }
+
+ int64_t estimated_buffered_dict_bytes() const override {
+ if (current_dict_encoder_) {
+ return current_dict_encoder_->dict_encoded_size();
+ }
+ return 0;
+ }
+
protected:
std::shared_ptr<Buffer> GetValuesBuffer() override {
return current_encoder_->FlushValues();
diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h
index e0d4b5234b..5ad58c5ecf 100644
--- a/cpp/src/parquet/column_writer.h
+++ b/cpp/src/parquet/column_writer.h
@@ -165,6 +165,15 @@ class PARQUET_EXPORT ColumnWriter {
/// \brief Estimated size of the values that are not written to a page yet.
virtual int64_t estimated_buffered_value_bytes() const = 0;
+ /// \brief Estimated size of the definition levels that are not written to a
page yet.
+ virtual int64_t estimated_buffered_def_level_bytes() const = 0;
+
+ /// \brief Estimated size of the repetition levels that are not written to a
page yet.
+ virtual int64_t estimated_buffered_rep_level_bytes() const = 0;
+
+ /// \brief Estimated size of the dictionary that is not written to a page
yet.
+ virtual int64_t estimated_buffered_dict_bytes() const = 0;
+
/// \brief The file-level writer properties
virtual const WriterProperties* properties() = 0;
diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index 842e667e8a..ec303408f3 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -69,6 +69,10 @@ int64_t RowGroupWriter::total_compressed_bytes_written()
const {
return contents_->total_compressed_bytes_written();
}
+RowGroupWriter::BufferedStats RowGroupWriter::estimated_buffered_stats() const
{
+ return contents_->EstimatedBufferedStats();
+}
+
bool RowGroupWriter::buffered() const { return contents_->buffered(); }
int RowGroupWriter::current_column() { return contents_->current_column(); }
@@ -198,6 +202,22 @@ class RowGroupSerializer : public RowGroupWriter::Contents
{
return total_compressed_bytes_written;
}
+ RowGroupWriter::BufferedStats EstimatedBufferedStats() const override {
+ RowGroupWriter::BufferedStats stats;
+ if (closed_) {
+ return stats;
+ }
+ for (const auto& column_writer : column_writers_) {
+ if (column_writer) {
+ stats.def_level_bytes +=
column_writer->estimated_buffered_def_level_bytes();
+ stats.rep_level_bytes +=
column_writer->estimated_buffered_rep_level_bytes();
+ stats.value_bytes += column_writer->estimated_buffered_value_bytes();
+ stats.dict_bytes += column_writer->estimated_buffered_dict_bytes();
+ }
+ }
+ return stats;
+ }
+
bool buffered() const override { return buffered_row_group_; }
void Close() override {
diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h
index d5ea1d7c98..636a7e9957 100644
--- a/cpp/src/parquet/file_writer.h
+++ b/cpp/src/parquet/file_writer.h
@@ -36,6 +36,15 @@ static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R',
'E'};
class PARQUET_EXPORT RowGroupWriter {
public:
+ // Estimated uncompressed byte sizes of data buffered by column writers
+ // that have not yet been serialized into pages.
+ struct BufferedStats {
+ int64_t def_level_bytes = 0;
+ int64_t rep_level_bytes = 0;
+ int64_t value_bytes = 0;
+ int64_t dict_bytes = 0;
+ };
+
// Forward declare a virtual class 'Contents' to aid dependency injection
and more
// easily create test fixtures
// An implementation of the Contents class is defined in the .cc file
@@ -58,6 +67,9 @@ class PARQUET_EXPORT RowGroupWriter {
virtual int64_t total_compressed_bytes() const = 0;
/// \brief total compressed bytes written by the page writer
virtual int64_t total_compressed_bytes_written() const = 0;
+ /// \brief Estimated sizes of buffered data (levels, values, dict) not yet
+ /// written to pages.
+ virtual BufferedStats EstimatedBufferedStats() const = 0;
virtual bool buffered() const = 0;
};
@@ -99,6 +111,9 @@ class PARQUET_EXPORT RowGroupWriter {
int64_t total_compressed_bytes() const;
/// \brief total compressed bytes written by the page writer
int64_t total_compressed_bytes_written() const;
+ /// \brief Estimated sizes of buffered data (levels, values, dict) not yet
+ /// written to pages.
+ BufferedStats estimated_buffered_stats() const;
/// Returns whether the current RowGroupWriter is in the buffered mode and
is created
/// by calling ParquetFileWriter::AppendBufferedRowGroup.