This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new 4be5561  fix(parquet/pqarrow): fix definition levels with non-nullable 
lists (#325)
4be5561 is described below

commit 4be55614ac122d0c9070bb3b2febd4fe7c6c55e8
Author: Matt Topol <[email protected]>
AuthorDate: Mon Mar 24 09:15:01 2025 -0400

    fix(parquet/pqarrow): fix definition levels with non-nullable lists (#325)
    
    ### Rationale for this change
    Related to https://github.com/apache/arrow/issues/38503, it was found in
    https://github.com/apache/iceberg-go/issues/357 that the Parquet file
    being written by pqarrow was incompatible with pyarrow in some testing.
    After some digging I determined the cause. So we should fix this to
    finally put #38503 to bed
    
    ### What changes are included in this PR?
    Fixing the computation of definition levels when writing from Arrow data
    with lists of non-nullable elements. Previously we were basing the
    `nullableInParent` on whether it was a list or a map, assuming that the
    element was *always* nullable in a list. This, of course, is incorrect
    and we need to actually *check* the nullability of the list element to
    compute the correct definition level. This way we don't produce
    incongruous levels that are larger than what the max definition level
    should be.
    
    ### Are these changes tested?
    Yes, I've updated the corresponding test, which wasn't *actually*
    testing a non-nullable element until now.
    
    ### Are there any user-facing changes?
    Only fixing writing of files.
---
 parquet/file/column_writer.go        | 10 ++++++++--
 parquet/pqarrow/path_builder.go      |  2 +-
 parquet/pqarrow/path_builder_test.go |  5 +++--
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/parquet/file/column_writer.go b/parquet/file/column_writer.go
index f608cd0..d087533 100644
--- a/parquet/file/column_writer.go
+++ b/parquet/file/column_writer.go
@@ -20,12 +20,14 @@ import (
        "bytes"
        "encoding/binary"
        "io"
+       "strconv"
 
        "github.com/apache/arrow-go/v18/arrow"
        "github.com/apache/arrow-go/v18/arrow/array"
        "github.com/apache/arrow-go/v18/arrow/bitutil"
        "github.com/apache/arrow-go/v18/arrow/memory"
        "github.com/apache/arrow-go/v18/parquet"
+       "github.com/apache/arrow-go/v18/parquet/internal/debug"
        "github.com/apache/arrow-go/v18/parquet/internal/encoding"
        "github.com/apache/arrow-go/v18/parquet/metadata"
        "github.com/apache/arrow-go/v18/parquet/schema"
@@ -428,10 +430,14 @@ func (w *columnWriter) FlushBufferedDataPages() (err 
error) {
 
 func (w *columnWriter) writeLevels(numValues int64, defLevels, repLevels 
[]int16) int64 {
        toWrite := int64(0)
+       maxDefLevel := w.descr.MaxDefinitionLevel()
+
        // if the field is required and non-repeated, no definition levels
-       if defLevels != nil && w.descr.MaxDefinitionLevel() > 0 {
+       if defLevels != nil && maxDefLevel > 0 {
                for _, v := range defLevels[:numValues] {
-                       if v == w.descr.MaxDefinitionLevel() {
+                       debug.Assert(v <= maxDefLevel, "columnwriter: invalid 
definition level "+
+                               strconv.Itoa(int(v))+" for column 
"+w.descr.Path())
+                       if v == maxDefLevel {
                                toWrite++
                        }
                }
diff --git a/parquet/pqarrow/path_builder.go b/parquet/pqarrow/path_builder.go
index 92b1256..7abd0f5 100644
--- a/parquet/pqarrow/path_builder.go
+++ b/parquet/pqarrow/path_builder.go
@@ -395,7 +395,7 @@ func (p *pathBuilder) Visit(arr arrow.Array) error {
                        repLevel:        p.info.maxRepLevel,
                        defLevelIfEmpty: p.info.maxDefLevel - 1,
                })
-               p.nullableInParent = ok
+               p.nullableInParent = 
arr.DataType().(arrow.ListLikeType).ElemField().Nullable
                return p.Visit(larr.ListValues())
        case arrow.FIXED_SIZE_LIST:
                p.maybeAddNullable(arr)
diff --git a/parquet/pqarrow/path_builder_test.go 
b/parquet/pqarrow/path_builder_test.go
index bb9d8bf..df548bc 100644
--- a/parquet/pqarrow/path_builder_test.go
+++ b/parquet/pqarrow/path_builder_test.go
@@ -39,7 +39,8 @@ func TestNonNullableSingleList(t *testing.T) {
        // So:
        // def level 0: a null entry
        // def level 1: a non-null entry
-       bldr := array.NewListBuilder(memory.DefaultAllocator, 
arrow.PrimitiveTypes.Int64)
+       bldr := array.NewBuilder(memory.DefaultAllocator,
+               
arrow.ListOfNonNullable(arrow.PrimitiveTypes.Int64)).(*array.ListBuilder)
        defer bldr.Release()
 
        vb := bldr.ValueBuilder().(*array.Int64Builder)
@@ -67,7 +68,7 @@ func TestNonNullableSingleList(t *testing.T) {
        result, err := mp.write(0, ctx)
        require.NoError(t, err)
 
-       assert.Equal(t, []int16{2, 2, 2, 2, 2, 2}, result.defLevels)
+       assert.Equal(t, []int16{1, 1, 1, 1, 1, 1}, result.defLevels)
        assert.Equal(t, []int16{0, 0, 1, 0, 1, 1}, result.repLevels)
        assert.Len(t, result.postListVisitedElems, 1)
        assert.EqualValues(t, 0, result.postListVisitedElems[0].start)

Reply via email to