This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
     new 5651f0ca fix(parquet/pqarrow): Fix null_count column stats (#489)
5651f0ca is described below

commit 5651f0cae82aadb8c81fcd324cd46606797335c7
Author: Travis Patterson <travis.patter...@grafana.com>
AuthorDate: Thu Sep 4 12:13:02 2025 -0600

    fix(parquet/pqarrow): Fix null_count column stats (#489)

    ### Rationale for this change

    When dictionary encoding is enabled and repetitions are set to
    `required`, the `null_count` statistic is negative because `defLevels`
    is always empty.

    ### What changes are included in this PR?

    This PR sets the null count to 0 if `len(defLevels) - nonNullCount < 0`
    (a standalone sketch of this clamp follows the diff below).

    ### Are these changes tested?

    Yes. I've added a new test that exposes another bug. I started with the
    `fullTypeList` variable like [this test](https://github.com/apache/arrow-go/blob/c6ce2ef4e55009a786cf04b3845eba5170c98066/parquet/pqarrow/encode_dictionary_test.go#L43)
    and discovered that not all types are supported. If an unsupported type
    is encoded as a dictionary, it results in a nasty panic in the typed
    dictionary encoder because the types don't line up.

    ### Are there any user-facing changes?

    Yes, the stats written to parquet files are currently wrong for these
    sorts of columns. This PR should fix that!
---
 parquet/pqarrow/encode_arrow_test.go      | 41 +++++++++++++++++++++++++++++
 parquet/pqarrow/encode_dict_compute.go    |  4 ++-
 parquet/pqarrow/encode_dictionary_test.go | 43 +++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/parquet/pqarrow/encode_arrow_test.go b/parquet/pqarrow/encode_arrow_test.go
index 61bc263f..a279f931 100644
--- a/parquet/pqarrow/encode_arrow_test.go
+++ b/parquet/pqarrow/encode_arrow_test.go
@@ -1284,6 +1284,30 @@ func (ps *ParquetIOTestSuite) writeColumn(mem memory.Allocator, sc *schema.Group
 	return buf.Bytes()
 }
 
+func (ps *ParquetIOTestSuite) writeDictionaryColumn(mem memory.Allocator, sc *schema.GroupNode, values arrow.Array) []byte {
+	var buf bytes.Buffer
+	arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil)
+	ps.NoError(err)
+
+	writer, err := pqarrow.NewFileWriter(
+		arrsc,
+		&buf,
+		parquet.NewWriterProperties(
+			parquet.WithDictionaryDefault(true),
+			parquet.WithStats(true),
+		),
+		pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)),
+	)
+	ps.NoError(err)
+
+	writer.NewRowGroup()
+	ps.NoError(writer.WriteColumnData(values))
+	ps.NoError(writer.Close())
+	ps.NoError(writer.Close())
+
+	return buf.Bytes()
+}
+
 func (ps *ParquetIOTestSuite) readAndCheckSingleColumnFile(mem memory.Allocator, data []byte, values arrow.Array) {
 	reader := ps.createReader(mem, data)
 	cr, err := reader.GetColumn(context.TODO(), 0)
@@ -1327,6 +1351,23 @@ var fullTypeList = []arrow.DataType{
 	&arrow.Decimal128Type{Precision: 38, Scale: 37},
 }
 
+var dictEncodingSupportedTypeList = []arrow.DataType{
+	arrow.PrimitiveTypes.Int32,
+	arrow.PrimitiveTypes.Int64,
+	arrow.PrimitiveTypes.Float32,
+	arrow.PrimitiveTypes.Float64,
+	arrow.BinaryTypes.String,
+	arrow.BinaryTypes.Binary,
+	&arrow.FixedSizeBinaryType{ByteWidth: 10},
+	&arrow.Decimal128Type{Precision: 1, Scale: 0},
+	&arrow.Decimal128Type{Precision: 5, Scale: 4},
+	&arrow.Decimal128Type{Precision: 10, Scale: 9},
+	&arrow.Decimal128Type{Precision: 19, Scale: 18},
+	&arrow.Decimal128Type{Precision: 23, Scale: 22},
+	&arrow.Decimal128Type{Precision: 27, Scale: 26},
+	&arrow.Decimal128Type{Precision: 38, Scale: 37},
+}
+
 func (ps *ParquetIOTestSuite) TestSingleColumnRequiredWrite() {
 	for _, dt := range fullTypeList {
 		ps.Run(dt.Name(), func() {
diff --git a/parquet/pqarrow/encode_dict_compute.go b/parquet/pqarrow/encode_dict_compute.go
index 707a6f4b..9f76cbfb 100644
--- a/parquet/pqarrow/encode_dict_compute.go
+++ b/parquet/pqarrow/encode_dict_compute.go
@@ -117,7 +117,9 @@ func writeDictionaryArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, lea
 	}
 
 	nonNullCount := indices.Len() - indices.NullN()
-	pageStats.IncNulls(int64(len(defLevels) - nonNullCount))
+	nullCount := max(int64(len(defLevels)-nonNullCount), 0)
+
+	pageStats.IncNulls(nullCount)
 	pageStats.IncNumValues(int64(nonNullCount))
 	return pageStats.UpdateFromArrow(referencedDict, false)
 }
diff --git a/parquet/pqarrow/encode_dictionary_test.go b/parquet/pqarrow/encode_dictionary_test.go
index 25625f4d..ac6d4e50 100644
--- a/parquet/pqarrow/encode_dictionary_test.go
+++ b/parquet/pqarrow/encode_dictionary_test.go
@@ -67,6 +67,49 @@ func (ps *ParquetIOTestSuite) TestSingleColumnOptionalDictionaryWrite() {
 	}
 }
 
+func (ps *ParquetIOTestSuite) TestSingleColumnRequiredDictionaryWrite() {
+	for _, dt := range dictEncodingSupportedTypeList {
+		// skip tests for bool as we don't do dictionaries for it
+		if dt.ID() == arrow.BOOL {
+			continue
+		}
+
+		ps.Run(dt.Name(), func() {
+			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+			defer mem.AssertSize(ps.T(), 0)
+
+			bldr := array.NewDictionaryBuilder(mem, &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int16, ValueType: dt})
+			defer bldr.Release()
+
+			values := testutils.RandomNonNull(mem, dt, smallSize)
+			defer values.Release()
+			ps.Require().NoError(bldr.AppendArray(values))
+
+			arr := bldr.NewDictionaryArray()
+			defer arr.Release()
+
+			sc := ps.makeSimpleSchema(arr.DataType(), parquet.Repetitions.Required)
+			data := ps.writeDictionaryColumn(mem, sc, arr)
+
+			rdr, err := file.NewParquetReader(bytes.NewReader(data))
+			ps.NoError(err)
+			defer rdr.Close()
+
+			metadata := rdr.MetaData()
+			ps.Len(metadata.RowGroups, 1)
+
+			rg := metadata.RowGroup(0)
+			col, err := rg.ColumnChunk(0)
+			ps.NoError(err)
+
+			stats, err := col.Statistics()
+			ps.NoError(err)
+			ps.EqualValues(smallSize, stats.NumValues())
+			ps.EqualValues(0, stats.NullCount())
+		})
+	}
+}
+
 func TestPqarrowDictionaries(t *testing.T) {
 	suite.Run(t, &ArrowWriteDictionarySuite{dataPageVersion: parquet.DataPageV1})
 	suite.Run(t, &ArrowWriteDictionarySuite{dataPageVersion: parquet.DataPageV2})
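
For reference, here is a minimal standalone sketch of the clamp in `encode_dict_compute.go` above. This is not the library's code: `nullCountForDictPage` and its parameters are illustrative, and the builtin `max` needs Go 1.21+.

```go
package main

import "fmt"

// nullCountForDictPage mirrors the fixed statistics logic: a required column
// carries no definition levels, so len(defLevels)-nonNullCount goes negative
// unless it is clamped to zero.
func nullCountForDictPage(defLevels []int16, indicesLen, indicesNullN int) int64 {
	nonNullCount := indicesLen - indicesNullN
	return max(int64(len(defLevels)-nonNullCount), 0)
}

func main() {
	// Required column: empty defLevels, 100 non-null dictionary indices.
	fmt.Println(nullCountForDictPage(nil, 100, 0)) // 0 (was -100 before the fix)

	// Optional column: one def level per slot, 10 of 100 indices are null.
	fmt.Println(nullCountForDictPage(make([]int16, 100), 100, 10)) // 10
}
```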
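
And a hedged sketch of how a user could inspect the written statistic in their own files, using the same metadata accessors as the new test. It assumes the `github.com/apache/arrow-go/v18` module path; `example.parquet` is a placeholder.

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/apache/arrow-go/v18/parquet/file"
)

func main() {
	// Open a parquet file written with stats enabled; the path is a placeholder.
	f, err := os.Open("example.parquet")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	rdr, err := file.NewParquetReader(f)
	if err != nil {
		log.Fatal(err)
	}
	defer rdr.Close()

	// First row group, first column chunk, as in the test above.
	col, err := rdr.MetaData().RowGroup(0).ColumnChunk(0)
	if err != nil {
		log.Fatal(err)
	}
	stats, err := col.Statistics()
	if err != nil {
		log.Fatal(err)
	}
	// With this fix the value is no longer negative for required dictionary columns.
	fmt.Println("null_count:", stats.NullCount())
}
```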