This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new 5651f0ca fix(parquet/pqarrow): Fix null_count column stats (#489)
5651f0ca is described below

commit 5651f0cae82aadb8c81fcd324cd46606797335c7
Author: Travis Patterson <travis.patter...@grafana.com>
AuthorDate: Thu Sep 4 12:13:02 2025 -0600

    fix(parquet/pqarrow): Fix null_count column stats (#489)
    
    ### Rationale for this change
    When dictionary encoding is enabled and repetitions are set to
    `required`, the `null_count` statistic is negative because `defLevels`
    is always 0.
    
    ### What changes are included in this PR?
    This PR sets the null count to 0 if `defLevels - nonNullCount < 0`
    
    ### Are these changes tested?
    Yes. I've added a new test that exposes another bug. I started with the
    `fullTypeList` variable like [this
    
test](https://github.com/apache/arrow-go/blob/c6ce2ef4e55009a786cf04b3845eba5170c98066/parquet/pqarrow/encode_dictionary_test.go#L43)
    and discovered that not all types are supported.
    
    If an unsupported type is encoded as a dictionary, it results in a nasty
    panic in the typed dictionary encoder because the types don't line up.
    
    ### Are there any user-facing changes?
    Yes, the stats written to parquet files are currently wrong for these
    sorts of columns. This PR should fix that!
---
 parquet/pqarrow/encode_arrow_test.go      | 41 +++++++++++++++++++++++++++++
 parquet/pqarrow/encode_dict_compute.go    |  4 ++-
 parquet/pqarrow/encode_dictionary_test.go | 43 +++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/parquet/pqarrow/encode_arrow_test.go 
b/parquet/pqarrow/encode_arrow_test.go
index 61bc263f..a279f931 100644
--- a/parquet/pqarrow/encode_arrow_test.go
+++ b/parquet/pqarrow/encode_arrow_test.go
@@ -1284,6 +1284,30 @@ func (ps *ParquetIOTestSuite) writeColumn(mem 
memory.Allocator, sc *schema.Group
        return buf.Bytes()
 }
 
+func (ps *ParquetIOTestSuite) writeDictionaryColumn(mem memory.Allocator, sc 
*schema.GroupNode, values arrow.Array) []byte {
+       var buf bytes.Buffer
+       arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil)
+       ps.NoError(err)
+
+       writer, err := pqarrow.NewFileWriter(
+               arrsc,
+               &buf,
+               parquet.NewWriterProperties(
+                       parquet.WithDictionaryDefault(true),
+                       parquet.WithStats(true),
+               ),
+               pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)),
+       )
+       ps.NoError(err)
+
+       writer.NewRowGroup()
+       ps.NoError(writer.WriteColumnData(values))
+       ps.NoError(writer.Close())
+       ps.NoError(writer.Close())
+
+       return buf.Bytes()
+}
+
 func (ps *ParquetIOTestSuite) readAndCheckSingleColumnFile(mem 
memory.Allocator, data []byte, values arrow.Array) {
        reader := ps.createReader(mem, data)
        cr, err := reader.GetColumn(context.TODO(), 0)
@@ -1327,6 +1351,23 @@ var fullTypeList = []arrow.DataType{
        &arrow.Decimal128Type{Precision: 38, Scale: 37},
 }
 
+var dictEncodingSupportedTypeList = []arrow.DataType{
+       arrow.PrimitiveTypes.Int32,
+       arrow.PrimitiveTypes.Int64,
+       arrow.PrimitiveTypes.Float32,
+       arrow.PrimitiveTypes.Float64,
+       arrow.BinaryTypes.String,
+       arrow.BinaryTypes.Binary,
+       &arrow.FixedSizeBinaryType{ByteWidth: 10},
+       &arrow.Decimal128Type{Precision: 1, Scale: 0},
+       &arrow.Decimal128Type{Precision: 5, Scale: 4},
+       &arrow.Decimal128Type{Precision: 10, Scale: 9},
+       &arrow.Decimal128Type{Precision: 19, Scale: 18},
+       &arrow.Decimal128Type{Precision: 23, Scale: 22},
+       &arrow.Decimal128Type{Precision: 27, Scale: 26},
+       &arrow.Decimal128Type{Precision: 38, Scale: 37},
+}
+
 func (ps *ParquetIOTestSuite) TestSingleColumnRequiredWrite() {
        for _, dt := range fullTypeList {
                ps.Run(dt.Name(), func() {
diff --git a/parquet/pqarrow/encode_dict_compute.go 
b/parquet/pqarrow/encode_dict_compute.go
index 707a6f4b..9f76cbfb 100644
--- a/parquet/pqarrow/encode_dict_compute.go
+++ b/parquet/pqarrow/encode_dict_compute.go
@@ -117,7 +117,9 @@ func writeDictionaryArrow(ctx *arrowWriteContext, cw 
file.ColumnChunkWriter, lea
                }
 
                nonNullCount := indices.Len() - indices.NullN()
-               pageStats.IncNulls(int64(len(defLevels) - nonNullCount))
+               nullCount := max(int64(len(defLevels)-nonNullCount), 0)
+
+               pageStats.IncNulls(nullCount)
                pageStats.IncNumValues(int64(nonNullCount))
                return pageStats.UpdateFromArrow(referencedDict, false)
        }
diff --git a/parquet/pqarrow/encode_dictionary_test.go 
b/parquet/pqarrow/encode_dictionary_test.go
index 25625f4d..ac6d4e50 100644
--- a/parquet/pqarrow/encode_dictionary_test.go
+++ b/parquet/pqarrow/encode_dictionary_test.go
@@ -67,6 +67,49 @@ func (ps *ParquetIOTestSuite) 
TestSingleColumnOptionalDictionaryWrite() {
        }
 }
 
+func (ps *ParquetIOTestSuite) TestSingleColumnRequiredDictionaryWrite() {
+       for _, dt := range dictEncodingSupportedTypeList {
+               // skip tests for bool as we don't do dictionaries for it
+               if dt.ID() == arrow.BOOL {
+                       continue
+               }
+
+               ps.Run(dt.Name(), func() {
+                       mem := 
memory.NewCheckedAllocator(memory.DefaultAllocator)
+                       defer mem.AssertSize(ps.T(), 0)
+
+                       bldr := array.NewDictionaryBuilder(mem, 
&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int16, ValueType: dt})
+                       defer bldr.Release()
+
+                       values := testutils.RandomNonNull(mem, dt, smallSize)
+                       defer values.Release()
+                       ps.Require().NoError(bldr.AppendArray(values))
+
+                       arr := bldr.NewDictionaryArray()
+                       defer arr.Release()
+
+                       sc := ps.makeSimpleSchema(arr.DataType(), 
parquet.Repetitions.Required)
+                       data := ps.writeDictionaryColumn(mem, sc, arr)
+
+                       rdr, err := file.NewParquetReader(bytes.NewReader(data))
+                       ps.NoError(err)
+                       defer rdr.Close()
+
+                       metadata := rdr.MetaData()
+                       ps.Len(metadata.RowGroups, 1)
+
+                       rg := metadata.RowGroup(0)
+                       col, err := rg.ColumnChunk(0)
+                       ps.NoError(err)
+
+                       stats, err := col.Statistics()
+                       ps.NoError(err)
+                       ps.EqualValues(smallSize, stats.NumValues())
+                       ps.EqualValues(0, stats.NullCount())
+               })
+       }
+}
+
 func TestPqarrowDictionaries(t *testing.T) {
        suite.Run(t, &ArrowWriteDictionarySuite{dataPageVersion: 
parquet.DataPageV1})
        suite.Run(t, &ArrowWriteDictionarySuite{dataPageVersion: 
parquet.DataPageV2})

Reply via email to