This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new fc9fc890 fix(parquet/metadata): do not pre-set hasDistinctCount in 
stat factories (#807)
fc9fc890 is described below

commit fc9fc890a714f28f8a91c72fe732a622db0f76b1
Author: Sai Asish Y <[email protected]>
AuthorDate: Thu May 14 16:46:07 2026 -0700

    fix(parquet/metadata): do not pre-set hasDistinctCount in stat factories 
(#807)
    
    ### Rationale for this change
    
    Fixes #806. The `New*Statistics` factories in
    `parquet/metadata/statistics_types.gen.go` initialize `hasDistinctCount:
    true` at construction time, so every fresh stat object reports a
    distinct count of 0. `Encode()` then unconditionally calls
    `enc.SetDistinctCount(0)`, which makes the writer emit `distinct_count:
    0` in both `ColumnChunk.metadata.statistics` and
    `DataPageHeader.statistics` for every column, even when no distinct
    count has been computed.
    
    The Parquet thrift `Statistics.distinct_count` field is OPTIONAL. Per
    spec semantics, it must be absent when unknown. Emitting 0 conflates "no
    distinct values" with "we don't know" and causes byte-asymmetry against
    parquet-cpp (pyarrow), which leaves the field unset for the same input.
    
    `IncDistinct()` is the only call site that should flip
    `hasDistinctCount` to true, and grepping the repo confirms it is never
    invoked from the writer paths, so flipping the default is safe.
    
    ### What changes are included in this PR?
    
    - Remove `hasDistinctCount: true` from the factory initializer in
    `statistics_types.gen.go.tmpl` and regenerate `statistics_types.gen.go`.
    `hasNullCount: true` is kept as-is (its semantics differ: `IncNulls` is
    called per page and `null_count: 0` is meaningful for a writer that
    observes zero nulls).
    - Add `TestNewStatisticsDistinctCountUnset` covering
    Int32/Int64/Float32/Float64/Boolean/ByteArray, asserting both
    `HasDistinctCount()` and `Encode().HasDistinctCount` start false and
    flip to true after `IncDistinct`.
    
    ### Are these changes tested?
    
    Yes. The new test fails on master (HasDistinctCount returns true on a
    fresh stat object) and passes after the fix. `go test
    ./parquet/metadata/...` is green.
    
    ### Are there any user-facing changes?
    
    Yes, writers that previously emitted `distinct_count: 0` for every
    column will now omit the field, matching the spec and parquet-cpp.
    Readers already tolerate both, so this is a wire-format normalization,
    not a breaking change for downstream consumers.
---
 parquet/metadata/statistics_test.go           | 37 ++++++++++
 parquet/metadata/statistics_types.gen.go      | 99 ++++++++++++---------------
 parquet/metadata/statistics_types.gen.go.tmpl |  1 -
 parquet/pqarrow/file_writer_test.go           |  8 +--
 4 files changed, 86 insertions(+), 59 deletions(-)

diff --git a/parquet/metadata/statistics_test.go 
b/parquet/metadata/statistics_test.go
index d312437a..a0d107fa 100644
--- a/parquet/metadata/statistics_test.go
+++ b/parquet/metadata/statistics_test.go
@@ -603,3 +603,40 @@ func TestBooleanStatisticsUpdateFromBitmapSpaced(t 
*testing.T) {
                assert.Equal(t, int64(8), stats.NullCount())
        })
 }
+
+func TestNewStatisticsDistinctCountUnset(t *testing.T) {
+       // Issue #806: factories must not pre-set hasDistinctCount, otherwise
+       // Encode() emits distinct_count=0 for every column instead of leaving 
the
+       // optional Thrift field absent. IncDistinct() is responsible for 
marking
+       // it set when an actual distinct count is computed.
+       mk := func(n *schema.PrimitiveNode) metadata.TypedStatistics {
+               col := schema.NewColumn(n, 0, 0)
+               return metadata.NewStatistics(col, memory.DefaultAllocator)
+       }
+       cases := []struct {
+               name string
+               s    metadata.TypedStatistics
+       }{
+               {"Int32", mk(schema.NewInt32Node("i32", 
parquet.Repetitions.Required, -1))},
+               {"Int64", mk(schema.NewInt64Node("i64", 
parquet.Repetitions.Required, -1))},
+               {"Float32", mk(schema.NewFloat32Node("f32", 
parquet.Repetitions.Required, -1))},
+               {"Float64", mk(schema.NewFloat64Node("f64", 
parquet.Repetitions.Required, -1))},
+               {"Boolean", mk(schema.NewBooleanNode("b", 
parquet.Repetitions.Required, -1))},
+               {"ByteArray", mk(schema.NewByteArrayNode("ba", 
parquet.Repetitions.Required, -1))},
+       }
+       for _, c := range cases {
+               t.Run(c.name, func(t *testing.T) {
+                       assert.False(t, c.s.HasDistinctCount(), "fresh stats 
must not advertise a distinct count")
+                       enc, err := c.s.Encode()
+                       require.NoError(t, err)
+                       assert.False(t, enc.HasDistinctCount, "encoded stats 
must leave distinct_count unset until IncDistinct is called")
+
+                       c.s.IncDistinct(3)
+                       assert.True(t, c.s.HasDistinctCount())
+                       enc, err = c.s.Encode()
+                       require.NoError(t, err)
+                       assert.True(t, enc.HasDistinctCount)
+                       assert.Equal(t, int64(3), enc.DistinctCount)
+               })
+       }
+}
diff --git a/parquet/metadata/statistics_types.gen.go 
b/parquet/metadata/statistics_types.gen.go
index a00010f9..a9a19ea1 100644
--- a/parquet/metadata/statistics_types.gen.go
+++ b/parquet/metadata/statistics_types.gen.go
@@ -57,12 +57,11 @@ func NewInt32Statistics(descr *schema.Column, mem 
memory.Allocator) *Int32Statis
 
        return &Int32Statistics{
                statistics: statistics{
-                       descr:            descr,
-                       hasNullCount:     true,
-                       hasDistinctCount: true,
-                       order:            descr.SortOrder(),
-                       encoder:          
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, 
descr, mem),
-                       mem:              mem,
+                       descr:        descr,
+                       hasNullCount: true,
+                       order:        descr.SortOrder(),
+                       encoder:      encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
+                       mem:          mem,
                },
        }
 }
@@ -358,12 +357,11 @@ func NewInt64Statistics(descr *schema.Column, mem 
memory.Allocator) *Int64Statis
 
        return &Int64Statistics{
                statistics: statistics{
-                       descr:            descr,
-                       hasNullCount:     true,
-                       hasDistinctCount: true,
-                       order:            descr.SortOrder(),
-                       encoder:          
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, 
descr, mem),
-                       mem:              mem,
+                       descr:        descr,
+                       hasNullCount: true,
+                       order:        descr.SortOrder(),
+                       encoder:      encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
+                       mem:          mem,
                },
        }
 }
@@ -659,12 +657,11 @@ func NewInt96Statistics(descr *schema.Column, mem 
memory.Allocator) *Int96Statis
 
        return &Int96Statistics{
                statistics: statistics{
-                       descr:            descr,
-                       hasNullCount:     true,
-                       hasDistinctCount: true,
-                       order:            descr.SortOrder(),
-                       encoder:          
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, 
descr, mem),
-                       mem:              mem,
+                       descr:        descr,
+                       hasNullCount: true,
+                       order:        descr.SortOrder(),
+                       encoder:      encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
+                       mem:          mem,
                },
        }
 }
@@ -938,12 +935,11 @@ func NewFloat32Statistics(descr *schema.Column, mem 
memory.Allocator) *Float32St
 
        return &Float32Statistics{
                statistics: statistics{
-                       descr:            descr,
-                       hasNullCount:     true,
-                       hasDistinctCount: true,
-                       order:            descr.SortOrder(),
-                       encoder:          
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, 
descr, mem),
-                       mem:              mem,
+                       descr:        descr,
+                       hasNullCount: true,
+                       order:        descr.SortOrder(),
+                       encoder:      encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
+                       mem:          mem,
                },
        }
 }
@@ -1231,12 +1227,11 @@ func NewFloat64Statistics(descr *schema.Column, mem 
memory.Allocator) *Float64St
 
        return &Float64Statistics{
                statistics: statistics{
-                       descr:            descr,
-                       hasNullCount:     true,
-                       hasDistinctCount: true,
-                       order:            descr.SortOrder(),
-                       encoder:          
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, 
descr, mem),
-                       mem:              mem,
+                       descr:        descr,
+                       hasNullCount: true,
+                       order:        descr.SortOrder(),
+                       encoder:      encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
+                       mem:          mem,
                },
        }
 }
@@ -1524,12 +1519,11 @@ func NewBooleanStatistics(descr *schema.Column, mem 
memory.Allocator) *BooleanSt
 
        return &BooleanStatistics{
                statistics: statistics{
-                       descr:            descr,
-                       hasNullCount:     true,
-                       hasDistinctCount: true,
-                       order:            descr.SortOrder(),
-                       encoder:          
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, 
descr, mem),
-                       mem:              mem,
+                       descr:        descr,
+                       hasNullCount: true,
+                       order:        descr.SortOrder(),
+                       encoder:      encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
+                       mem:          mem,
                },
        }
 }
@@ -1879,12 +1873,11 @@ func NewByteArrayStatistics(descr *schema.Column, mem 
memory.Allocator) *ByteArr
 
        return &ByteArrayStatistics{
                statistics: statistics{
-                       descr:            descr,
-                       hasNullCount:     true,
-                       hasDistinctCount: true,
-                       order:            descr.SortOrder(),
-                       encoder:          
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, 
descr, mem),
-                       mem:              mem,
+                       descr:        descr,
+                       hasNullCount: true,
+                       order:        descr.SortOrder(),
+                       encoder:      encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
+                       mem:          mem,
                },
 
                min: make([]byte, 0),
@@ -2187,12 +2180,11 @@ func NewFixedLenByteArrayStatistics(descr 
*schema.Column, mem memory.Allocator)
 
        return &FixedLenByteArrayStatistics{
                statistics: statistics{
-                       descr:            descr,
-                       hasNullCount:     true,
-                       hasDistinctCount: true,
-                       order:            descr.SortOrder(),
-                       encoder:          
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, 
descr, mem),
-                       mem:              mem,
+                       descr:        descr,
+                       hasNullCount: true,
+                       order:        descr.SortOrder(),
+                       encoder:      encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
+                       mem:          mem,
                },
        }
 }
@@ -2502,12 +2494,11 @@ func NewFloat16Statistics(descr *schema.Column, mem 
memory.Allocator) *Float16St
 
        return &Float16Statistics{
                statistics: statistics{
-                       descr:            descr,
-                       hasNullCount:     true,
-                       hasDistinctCount: true,
-                       order:            descr.SortOrder(),
-                       encoder:          
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, 
descr, mem),
-                       mem:              mem,
+                       descr:        descr,
+                       hasNullCount: true,
+                       order:        descr.SortOrder(),
+                       encoder:      encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
+                       mem:          mem,
                },
        }
 }
diff --git a/parquet/metadata/statistics_types.gen.go.tmpl 
b/parquet/metadata/statistics_types.gen.go.tmpl
index 83493998..76657943 100644
--- a/parquet/metadata/statistics_types.gen.go.tmpl
+++ b/parquet/metadata/statistics_types.gen.go.tmpl
@@ -62,7 +62,6 @@ func New{{.Name}}Statistics(descr *schema.Column, mem 
memory.Allocator) *{{.Name
     statistics: statistics{
       descr: descr,
       hasNullCount: true,
-      hasDistinctCount: true,
       order: descr.SortOrder(),
       encoder: encoding.NewEncoder(descr.PhysicalType(), 
parquet.Encodings.Plain, false, descr, mem),
       mem: mem,
diff --git a/parquet/pqarrow/file_writer_test.go 
b/parquet/pqarrow/file_writer_test.go
index 7b98b212..7ba4b85b 100644
--- a/parquet/pqarrow/file_writer_test.go
+++ b/parquet/pqarrow/file_writer_test.go
@@ -171,8 +171,8 @@ func TestFileWriterTotalBytes(t *testing.T) {
        require.NoError(t, writer.Close())
 
        // Verify total bytes & compressed bytes are correct
-       assert.Equal(t, int64(340), writer.TotalCompressedBytes())
-       assert.Equal(t, int64(799), writer.TotalBytesWritten())
+       assert.Equal(t, int64(332), writer.TotalCompressedBytes())
+       assert.Equal(t, int64(783), writer.TotalBytesWritten())
 }
 
 func TestFileWriterTotalBytesBuffered(t *testing.T) {
@@ -205,8 +205,8 @@ func TestFileWriterTotalBytesBuffered(t *testing.T) {
        require.NoError(t, writer.Close())
 
        // Verify total bytes & compressed bytes are correct
-       assert.Equal(t, int64(494), writer.TotalCompressedBytes())
-       assert.Equal(t, int64(1139), writer.TotalBytesWritten())
+       assert.Equal(t, int64(482), writer.TotalCompressedBytes())
+       assert.Equal(t, int64(1115), writer.TotalBytesWritten())
 }
 
 func TestWriteOnClosedFileWriter(t *testing.T) {

Reply via email to