This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 14eb8f3 fix(parquet/pqarrow): Fix propagation of field-ids for Lists
(#397)
14eb8f3 is described below
commit 14eb8f3609ffec35c5e14e1d557929a3b8c0bb44
Author: Matt Topol <[email protected]>
AuthorDate: Fri Jun 6 10:33:30 2025 -0400
fix(parquet/pqarrow): Fix propagation of field-ids for Lists (#397)
### Rationale for this change
An issue was found in apache/iceberg-go when attempting to retrieve data
from a table containing a list column that had a struct as the element:
the element-id for the fields was not propagated when fetching.
I tracked it down to the schema handling here.
### What changes are included in this PR?
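Changes the `getNestedFactory` function in pqarrow/schema.go to use
`ListOfField` instead of `ListOf` (and likewise `FixedSizeListOfField`
and `MapOfFields` instead of `FixedSizeListOf` and `MapOf`) so that it
preserves the field metadata, i.e. the field id.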
### Are these changes tested?
Yes, a test has been added to cover this scenario.
### Are there any user-facing changes?
Previously this situation would result in a field-id of -1; now users
will see the field-id propagated correctly.
---
parquet/pqarrow/encode_dictionary_test.go | 4 +++-
parquet/pqarrow/schema.go | 6 +++---
parquet/pqarrow/schema_test.go | 30 ++++++++++++++++++++++++++++++
3 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/parquet/pqarrow/encode_dictionary_test.go
b/parquet/pqarrow/encode_dictionary_test.go
index d1bf60d..25625f4 100644
--- a/parquet/pqarrow/encode_dictionary_test.go
+++ b/parquet/pqarrow/encode_dictionary_test.go
@@ -688,7 +688,9 @@ func TestArrowWriteNestedSubfieldDictionary(t *testing.T) {
dictValues := array.NewDictionaryArray(dictType, indices, dict)
defer dictValues.Release()
- data := array.NewData(arrow.ListOf(dictType), 3, []*memory.Buffer{nil,
offsets.Data().Buffers()[1]},
+ data := array.NewData(arrow.ListOfField(arrow.Field{
+ Name: "element", Type: dictType, Nullable: true,
+ Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"},
[]string{"-1"})}), 3, []*memory.Buffer{nil, offsets.Data().Buffers()[1]},
[]arrow.ArrayData{dictValues.Data()}, 0, 0)
defer data.Release()
values := array.NewListData(data)
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index d8364a0..34e4cc6 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -1015,19 +1015,19 @@ func getNestedFactory(origin, inferred arrow.DataType)
func(fieldList []arrow.Fi
switch origin.ID() {
case arrow.LIST:
return func(list []arrow.Field) arrow.DataType {
- return arrow.ListOf(list[0].Type)
+ return arrow.ListOfField(list[0])
}
case arrow.FIXED_SIZE_LIST:
sz := origin.(*arrow.FixedSizeListType).Len()
return func(list []arrow.Field) arrow.DataType {
- return arrow.FixedSizeListOf(sz, list[0].Type)
+ return arrow.FixedSizeListOfField(sz, list[0])
}
}
case arrow.MAP:
if origin.ID() == arrow.MAP {
return func(list []arrow.Field) arrow.DataType {
valType := list[0].Type.(*arrow.StructType)
- return arrow.MapOf(valType.Field(0).Type,
valType.Field(1).Type)
+ return arrow.MapOfFields(valType.Field(0),
valType.Field(1))
}
}
}
diff --git a/parquet/pqarrow/schema_test.go b/parquet/pqarrow/schema_test.go
index ef03ae4..58475dc 100644
--- a/parquet/pqarrow/schema_test.go
+++ b/parquet/pqarrow/schema_test.go
@@ -473,6 +473,36 @@ func TestProperListElementNullability(t *testing.T) {
assert.True(t, arrSchema.Equal(outSchema), "expected: %s, got: %s",
arrSchema, outSchema)
}
+func TestFieldNestedPropagate(t *testing.T) {
+ arrSchema := arrow.NewSchema([]arrow.Field{
+ {Name: "transformations", Type: arrow.ListOfField(
+ arrow.Field{
+ Name: "element",
+ Type: arrow.StructOf(
+ arrow.Field{Name: "destination", Type:
arrow.BinaryTypes.String,
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"6"})},
+ arrow.Field{Name: "transform_type",
Type: arrow.BinaryTypes.String,
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"7"})},
+ arrow.Field{Name: "transform_value",
Type: arrow.BinaryTypes.String,
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"8"})},
+ arrow.Field{Name: "source_cols", Type:
arrow.ListOfField(
+ arrow.Field{Name: "element",
Type: arrow.BinaryTypes.String, Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"10"})}),
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"9"})},
+ ),
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"5"}),
+ },
+ ), Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"},
[]string{"4"})},
+ }, nil)
+
+ pqSchema, err := pqarrow.ToParquet(arrSchema, nil,
pqarrow.DefaultWriterProps())
+ require.NoError(t, err)
+
+ result, err := pqarrow.FromParquet(pqSchema, nil,
metadata.KeyValueMetadata{})
+ require.NoError(t, err)
+
+ assert.True(t, arrSchema.Equal(result), "expected: %s, got: %s",
arrSchema, result)
+}
+
func TestConvertSchemaParquetVariant(t *testing.T) {
// unshredded variant:
// optional group variant_col {