This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 29a0581f5b GH-39870: [Go] Include buffered pages in TotalBytesWritten 
(#40105)
29a0581f5b is described below

commit 29a0581f5bfcad86a6493854f8be8fcb6ffe2fbc
Author: Matthew McNew <[email protected]>
AuthorDate: Tue Feb 20 19:59:57 2024 -0600

    GH-39870: [Go] Include buffered pages in TotalBytesWritten (#40105)
    
    
    
    ### Rationale for this change
    
    Currently, buffered data pages are not included in TotalBytesWritten; this 
means that there is not an accurate estimate of the current size.
    
    ### Are there any user-facing changes?
    `RowGroupTotalBytesWritten` will include the TotalBytes in buffered 
DataPages, minus the buffered data page headers.
    
    * Closes: #39870
    
    Authored-by: Matthew McNew <[email protected]>
    Signed-off-by: Matt Topol <[email protected]>
---
 go/parquet/file/column_writer.go      |  7 ++++++-
 go/parquet/file/column_writer_test.go | 14 ++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/go/parquet/file/column_writer.go b/go/parquet/file/column_writer.go
index 4d603c547c..91f5d18942 100755
--- a/go/parquet/file/column_writer.go
+++ b/go/parquet/file/column_writer.go
@@ -198,7 +198,12 @@ func (w *columnWriter) TotalCompressedBytes() int64 {
 }
 
 func (w *columnWriter) TotalBytesWritten() int64 {
-       return w.totalBytesWritten
+       bufferedPagesBytes := int64(0)
+       for _, p := range w.pages {
+               bufferedPagesBytes += int64(len(p.Data()))
+       }
+
+       return w.totalBytesWritten + bufferedPagesBytes
 }
 
 func (w *columnWriter) RowsWritten() int {
diff --git a/go/parquet/file/column_writer_test.go 
b/go/parquet/file/column_writer_test.go
index dd597e280b..d78e1c6761 100755
--- a/go/parquet/file/column_writer_test.go
+++ b/go/parquet/file/column_writer_test.go
@@ -430,6 +430,11 @@ func (p *PrimitiveWriterTestSuite) 
testDictionaryFallbackEncoding(version parque
 }
 
 func (p *PrimitiveWriterTestSuite) 
testDictionaryFallbackAndCompressedSize(version parquet.Version) {
+       // skip boolean as dictionary encoding is not used
+       if p.Typ.Kind() == reflect.Bool {
+               return
+       }
+
        p.GenerateData(SmallSize)
        props := parquet.DefaultColumnProperties()
        props.DictionaryEnabled = true
@@ -440,13 +445,14 @@ func (p *PrimitiveWriterTestSuite) 
testDictionaryFallbackAndCompressedSize(versi
                props.Encoding = parquet.Encodings.RLEDict
        }
 
-       writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version))
+       writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version), 
parquet.WithDataPageSize(SmallSize-1))
        p.WriteBatchValues(writer, nil, nil)
+       p.NotZero(writer.TotalBytesWritten())
        writer.FallbackToPlain()
-       p.NotEqual(0, writer.TotalCompressedBytes())
+       p.NotZero(writer.TotalCompressedBytes())
        writer.Close()
-       p.NotEqual(0, writer.TotalCompressedBytes())
-       p.NotEqual(0, writer.TotalBytesWritten())
+       p.NotZero(writer.TotalCompressedBytes())
+       p.NotZero(writer.TotalBytesWritten())
 }
 
 func (p *PrimitiveWriterTestSuite) TestRequiredPlain() {

Reply via email to