This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 29a0581f5b GH-39870: [Go] Include buffered pages in TotalBytesWritten
(#40105)
29a0581f5b is described below
commit 29a0581f5bfcad86a6493854f8be8fcb6ffe2fbc
Author: Matthew McNew <[email protected]>
AuthorDate: Tue Feb 20 19:59:57 2024 -0600
GH-39870: [Go] Include buffered pages in TotalBytesWritten (#40105)
### Rationale for this change
Currently, buffered data pages are not included in TotalBytesWritten. This
means that there is no accurate estimate of the current size.
### Are there any user-facing changes?
`RowGroupTotalBytesWritten` will now include the total bytes held in buffered
data pages, excluding the headers of those buffered data pages.
* Closes: #39870
Authored-by: Matthew McNew <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
---
go/parquet/file/column_writer.go | 7 ++++++-
go/parquet/file/column_writer_test.go | 14 ++++++++++----
2 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/go/parquet/file/column_writer.go b/go/parquet/file/column_writer.go
index 4d603c547c..91f5d18942 100755
--- a/go/parquet/file/column_writer.go
+++ b/go/parquet/file/column_writer.go
@@ -198,7 +198,12 @@ func (w *columnWriter) TotalCompressedBytes() int64 {
}
func (w *columnWriter) TotalBytesWritten() int64 {
- return w.totalBytesWritten
+ bufferedPagesBytes := int64(0)
+ for _, p := range w.pages {
+ bufferedPagesBytes += int64(len(p.Data()))
+ }
+
+ return w.totalBytesWritten + bufferedPagesBytes
}
func (w *columnWriter) RowsWritten() int {
diff --git a/go/parquet/file/column_writer_test.go
b/go/parquet/file/column_writer_test.go
index dd597e280b..d78e1c6761 100755
--- a/go/parquet/file/column_writer_test.go
+++ b/go/parquet/file/column_writer_test.go
@@ -430,6 +430,11 @@ func (p *PrimitiveWriterTestSuite)
testDictionaryFallbackEncoding(version parque
}
func (p *PrimitiveWriterTestSuite)
testDictionaryFallbackAndCompressedSize(version parquet.Version) {
+ // skip boolean as dictionary encoding is not used
+ if p.Typ.Kind() == reflect.Bool {
+ return
+ }
+
p.GenerateData(SmallSize)
props := parquet.DefaultColumnProperties()
props.DictionaryEnabled = true
@@ -440,13 +445,14 @@ func (p *PrimitiveWriterTestSuite)
testDictionaryFallbackAndCompressedSize(versi
props.Encoding = parquet.Encodings.RLEDict
}
- writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version))
+ writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version),
parquet.WithDataPageSize(SmallSize-1))
p.WriteBatchValues(writer, nil, nil)
+ p.NotZero(writer.TotalBytesWritten())
writer.FallbackToPlain()
- p.NotEqual(0, writer.TotalCompressedBytes())
+ p.NotZero(writer.TotalCompressedBytes())
writer.Close()
- p.NotEqual(0, writer.TotalCompressedBytes())
- p.NotEqual(0, writer.TotalBytesWritten())
+ p.NotZero(writer.TotalCompressedBytes())
+ p.NotZero(writer.TotalBytesWritten())
}
func (p *PrimitiveWriterTestSuite) TestRequiredPlain() {