This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 4bbd1332 feat(writer): support length() for both open and closed
writers (#433)
4bbd1332 is described below
commit 4bbd133256b397e10dda22984d663e0194b7bbb3
Author: Gang Wu <[email protected]>
AuthorDate: Wed Dec 24 16:13:20 2025 +0800
feat(writer): support length() for both open and closed writers (#433)
Modified Writer::length() to return current flushed position when open
and final file length when closed. Previously length() required the
writer to be closed.
---
src/iceberg/avro/avro_writer.cc | 17 ++++++++++-------
src/iceberg/parquet/parquet_writer.cc | 18 +++++++++++-------
2 files changed, 21 insertions(+), 14 deletions(-)
diff --git a/src/iceberg/avro/avro_writer.cc b/src/iceberg/avro/avro_writer.cc
index 9fec43a1..9d65db15 100644
--- a/src/iceberg/avro/avro_writer.cc
+++ b/src/iceberg/avro/avro_writer.cc
@@ -120,7 +120,14 @@ class AvroWriter::Impl {
bool Closed() const { return writer_ == nullptr; }
- int64_t length() { return total_bytes_; }
+ Result<int64_t> length() {
+ if (Closed()) {
+ return total_bytes_;
+ }
+ // Return current flushed length when writer is still open
+ ICEBERG_ARROW_ASSIGN_OR_RETURN(auto current_pos, arrow_output_stream_->Tell());
+ return current_pos;
+ }
private:
// The schema to write.
@@ -135,6 +142,7 @@ class AvroWriter::Impl {
std::unique_ptr<::avro::GenericDatum> datum_;
// Arrow schema to write data.
ArrowSchema arrow_schema_;
+ // Total length of the written Avro file.
int64_t total_bytes_ = 0;
};
@@ -162,12 +170,7 @@ Result<Metrics> AvroWriter::metrics() {
return Invalid("AvroWriter is not closed");
}
-Result<int64_t> AvroWriter::length() {
- if (impl_->Closed()) {
- return impl_->length();
- }
- return Invalid("AvroWriter is not closed");
-}
+Result<int64_t> AvroWriter::length() { return impl_->length(); }
std::vector<int64_t> AvroWriter::split_offsets() { return {}; }
diff --git a/src/iceberg/parquet/parquet_writer.cc b/src/iceberg/parquet/parquet_writer.cc
index 44e6133d..886348f0 100644
--- a/src/iceberg/parquet/parquet_writer.cc
+++ b/src/iceberg/parquet/parquet_writer.cc
@@ -107,7 +107,16 @@ class ParquetWriter::Impl {
bool Closed() const { return writer_ == nullptr; }
- int64_t length() const { return total_bytes_; }
+ Result<int64_t> length() {
+ if (Closed()) {
+ return total_bytes_;
+ }
+ // Return current flushed length when writer is still open.
+ // It would be good if we could get the number of buffered bytes
+ // from the internal RowGroupWriter.
+ ICEBERG_ARROW_ASSIGN_OR_RETURN(auto current_pos, output_stream_->Tell());
+ return current_pos;
+ }
std::vector<int64_t> split_offsets() const { return split_offsets_; }
@@ -144,12 +153,7 @@ Result<Metrics> ParquetWriter::metrics() {
return {};
}
-Result<int64_t> ParquetWriter::length() {
- if (!impl_->Closed()) {
- return Invalid("ParquetWriter is not closed");
- }
- return impl_->length();
-}
+Result<int64_t> ParquetWriter::length() { return impl_->length(); }
std::vector<int64_t> ParquetWriter::split_offsets() {
if (!impl_->Closed()) {