This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new 14eb8f3  fix(parquet/pqarrow): Fix propagation of field-ids for Lists 
(#397)
14eb8f3 is described below

commit 14eb8f3609ffec35c5e14e1d557929a3b8c0bb44
Author: Matt Topol <[email protected]>
AuthorDate: Fri Jun 6 10:33:30 2025 -0400

    fix(parquet/pqarrow): Fix propagation of field-ids for Lists (#397)
    
    ### Rationale for this change
    An issue was found in apache/iceberg-go when attempting to retrieve data
    from a table containing a List Column that had a struct as the element.
    It was failing to propagate the element-id for the fields when fetching.
    I tracked it down to the schema handling here.
    
    ### What changes are included in this PR?
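    Changes the `getNestedFactory` function in pqarrow/schema.go to use
    `ListOfField` instead of `ListOf` (and, likewise, `FixedSizeListOfField`
    and `MapOfFields` instead of `FixedSizeListOf` and `MapOf`) so that it
    preserves the field metadata, i.e. the field id.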
    
    ### Are these changes tested?
    Yes, a test has been added to cover this scenario.
    
    ### Are there any user-facing changes?
    Previously this situation would result in a field-id of -1; now users
    will see the field-id propagated correctly.
---
 parquet/pqarrow/encode_dictionary_test.go |  4 +++-
 parquet/pqarrow/schema.go                 |  6 +++---
 parquet/pqarrow/schema_test.go            | 30 ++++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/parquet/pqarrow/encode_dictionary_test.go 
b/parquet/pqarrow/encode_dictionary_test.go
index d1bf60d..25625f4 100644
--- a/parquet/pqarrow/encode_dictionary_test.go
+++ b/parquet/pqarrow/encode_dictionary_test.go
@@ -688,7 +688,9 @@ func TestArrowWriteNestedSubfieldDictionary(t *testing.T) {
        dictValues := array.NewDictionaryArray(dictType, indices, dict)
        defer dictValues.Release()
 
-       data := array.NewData(arrow.ListOf(dictType), 3, []*memory.Buffer{nil, 
offsets.Data().Buffers()[1]},
+       data := array.NewData(arrow.ListOfField(arrow.Field{
+               Name: "element", Type: dictType, Nullable: true,
+               Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, 
[]string{"-1"})}), 3, []*memory.Buffer{nil, offsets.Data().Buffers()[1]},
                []arrow.ArrayData{dictValues.Data()}, 0, 0)
        defer data.Release()
        values := array.NewListData(data)
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index d8364a0..34e4cc6 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -1015,19 +1015,19 @@ func getNestedFactory(origin, inferred arrow.DataType) 
func(fieldList []arrow.Fi
                switch origin.ID() {
                case arrow.LIST:
                        return func(list []arrow.Field) arrow.DataType {
-                               return arrow.ListOf(list[0].Type)
+                               return arrow.ListOfField(list[0])
                        }
                case arrow.FIXED_SIZE_LIST:
                        sz := origin.(*arrow.FixedSizeListType).Len()
                        return func(list []arrow.Field) arrow.DataType {
-                               return arrow.FixedSizeListOf(sz, list[0].Type)
+                               return arrow.FixedSizeListOfField(sz, list[0])
                        }
                }
        case arrow.MAP:
                if origin.ID() == arrow.MAP {
                        return func(list []arrow.Field) arrow.DataType {
                                valType := list[0].Type.(*arrow.StructType)
-                               return arrow.MapOf(valType.Field(0).Type, 
valType.Field(1).Type)
+                               return arrow.MapOfFields(valType.Field(0), 
valType.Field(1))
                        }
                }
        }
diff --git a/parquet/pqarrow/schema_test.go b/parquet/pqarrow/schema_test.go
index ef03ae4..58475dc 100644
--- a/parquet/pqarrow/schema_test.go
+++ b/parquet/pqarrow/schema_test.go
@@ -473,6 +473,36 @@ func TestProperListElementNullability(t *testing.T) {
        assert.True(t, arrSchema.Equal(outSchema), "expected: %s, got: %s", 
arrSchema, outSchema)
 }
 
+func TestFieldNestedPropagate(t *testing.T) {
+       arrSchema := arrow.NewSchema([]arrow.Field{
+               {Name: "transformations", Type: arrow.ListOfField(
+                       arrow.Field{
+                               Name: "element",
+                               Type: arrow.StructOf(
+                                       arrow.Field{Name: "destination", Type: 
arrow.BinaryTypes.String,
+                                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"6"})},
+                                       arrow.Field{Name: "transform_type", 
Type: arrow.BinaryTypes.String,
+                                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"7"})},
+                                       arrow.Field{Name: "transform_value", 
Type: arrow.BinaryTypes.String,
+                                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"8"})},
+                                       arrow.Field{Name: "source_cols", Type: 
arrow.ListOfField(
+                                               arrow.Field{Name: "element", 
Type: arrow.BinaryTypes.String, Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"10"})}),
+                                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"9"})},
+                               ),
+                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"5"}),
+                       },
+               ), Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, 
[]string{"4"})},
+       }, nil)
+
+       pqSchema, err := pqarrow.ToParquet(arrSchema, nil, 
pqarrow.DefaultWriterProps())
+       require.NoError(t, err)
+
+       result, err := pqarrow.FromParquet(pqSchema, nil, 
metadata.KeyValueMetadata{})
+       require.NoError(t, err)
+
+       assert.True(t, arrSchema.Equal(result), "expected: %s, got: %s", 
arrSchema, result)
+}
+
 func TestConvertSchemaParquetVariant(t *testing.T) {
        // unshredded variant:
        // optional group variant_col {

Reply via email to