This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 4be5561 fix(parquet/pqarrow): fix definition levels with non-nullable
lists (#325)
4be5561 is described below
commit 4be55614ac122d0c9070bb3b2febd4fe7c6c55e8
Author: Matt Topol <[email protected]>
AuthorDate: Mon Mar 24 09:15:01 2025 -0400
fix(parquet/pqarrow): fix definition levels with non-nullable lists (#325)
### Rationale for this change
Related to https://github.com/apache/arrow/issues/38503, it was found in
https://github.com/apache/iceberg-go/issues/357 that the Parquet file
being written by pqarrow was incompatible with pyarrow in some testing.
After some digging I determined the cause. So we should fix this to
finally put #38503 to bed
### What changes are included in this PR?
Fixing the computation of definition levels when writing from Arrow data
with lists of non-nullable elements. Previously we were basing the
`nullableInParent` on whether it was a list or a map, assuming that the
element was *always* nullable in a list. This, of course, is incorrect
and we need to actually *check* the nullability of the list element to
compute the correct definition level. This way we don't produce
incongruous levels that are larger than what the max definition level
should be.
### Are these changes tested?
Yes, I've updated the corresponding test, which wasn't *actually*
testing a non-nullable element until now.
### Are there any user-facing changes?
Only fixing writing of files.
---
parquet/file/column_writer.go | 10 ++++++++--
parquet/pqarrow/path_builder.go | 2 +-
parquet/pqarrow/path_builder_test.go | 5 +++--
3 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/parquet/file/column_writer.go b/parquet/file/column_writer.go
index f608cd0..d087533 100644
--- a/parquet/file/column_writer.go
+++ b/parquet/file/column_writer.go
@@ -20,12 +20,14 @@ import (
"bytes"
"encoding/binary"
"io"
+ "strconv"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/apache/arrow-go/v18/arrow/bitutil"
"github.com/apache/arrow-go/v18/arrow/memory"
"github.com/apache/arrow-go/v18/parquet"
+ "github.com/apache/arrow-go/v18/parquet/internal/debug"
"github.com/apache/arrow-go/v18/parquet/internal/encoding"
"github.com/apache/arrow-go/v18/parquet/metadata"
"github.com/apache/arrow-go/v18/parquet/schema"
@@ -428,10 +430,14 @@ func (w *columnWriter) FlushBufferedDataPages() (err
error) {
func (w *columnWriter) writeLevels(numValues int64, defLevels, repLevels
[]int16) int64 {
toWrite := int64(0)
+ maxDefLevel := w.descr.MaxDefinitionLevel()
+
// if the field is required and non-repeated, no definition levels
- if defLevels != nil && w.descr.MaxDefinitionLevel() > 0 {
+ if defLevels != nil && maxDefLevel > 0 {
for _, v := range defLevels[:numValues] {
- if v == w.descr.MaxDefinitionLevel() {
+ debug.Assert(v <= maxDefLevel, "columnwriter: invalid
definition level "+
+ strconv.Itoa(int(v))+" for column
"+w.descr.Path())
+ if v == maxDefLevel {
toWrite++
}
}
diff --git a/parquet/pqarrow/path_builder.go b/parquet/pqarrow/path_builder.go
index 92b1256..7abd0f5 100644
--- a/parquet/pqarrow/path_builder.go
+++ b/parquet/pqarrow/path_builder.go
@@ -395,7 +395,7 @@ func (p *pathBuilder) Visit(arr arrow.Array) error {
repLevel: p.info.maxRepLevel,
defLevelIfEmpty: p.info.maxDefLevel - 1,
})
- p.nullableInParent = ok
+ p.nullableInParent =
arr.DataType().(arrow.ListLikeType).ElemField().Nullable
return p.Visit(larr.ListValues())
case arrow.FIXED_SIZE_LIST:
p.maybeAddNullable(arr)
diff --git a/parquet/pqarrow/path_builder_test.go
b/parquet/pqarrow/path_builder_test.go
index bb9d8bf..df548bc 100644
--- a/parquet/pqarrow/path_builder_test.go
+++ b/parquet/pqarrow/path_builder_test.go
@@ -39,7 +39,8 @@ func TestNonNullableSingleList(t *testing.T) {
// So:
// def level 0: a null entry
// def level 1: a non-null entry
- bldr := array.NewListBuilder(memory.DefaultAllocator,
arrow.PrimitiveTypes.Int64)
+ bldr := array.NewBuilder(memory.DefaultAllocator,
+
arrow.ListOfNonNullable(arrow.PrimitiveTypes.Int64)).(*array.ListBuilder)
defer bldr.Release()
vb := bldr.ValueBuilder().(*array.Int64Builder)
@@ -67,7 +68,7 @@ func TestNonNullableSingleList(t *testing.T) {
result, err := mp.write(0, ctx)
require.NoError(t, err)
- assert.Equal(t, []int16{2, 2, 2, 2, 2, 2}, result.defLevels)
+ assert.Equal(t, []int16{1, 1, 1, 1, 1, 1}, result.defLevels)
assert.Equal(t, []int16{0, 0, 1, 0, 1, 1}, result.repLevels)
assert.Len(t, result.postListVisitedElems, 1)
assert.EqualValues(t, 0, result.postListVisitedElems[0].start)