This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new ca6e0eb  fix(parquet/pqarrow): propagate field id metadata for 
lists/maps (#293)
ca6e0eb is described below

commit ca6e0eb477c57a16edd787f6eba4c097a11d9c7a
Author: Matt Topol <[email protected]>
AuthorDate: Fri Feb 21 12:50:54 2025 -0500

    fix(parquet/pqarrow): propagate field id metadata for lists/maps (#293)
    
    ### Rationale for this change
    Discovered while fixing https://github.com/apache/iceberg-go/issues/309
    we didn't correctly propagate the field-id metadata to children of List
    or Map fields, only structs.
    
    ### What changes are included in this PR?
    A new MapType creator for constructing MapTypes from arrow fields for
    the Key and Items for easier construction, and fixing the `pqarrow`
    schema manifest creation to correctly propagate the child metadata field
    IDs for the children.
    
    ### Are these changes tested?
    Unit test is added.
    
    ### Are there any user-facing changes?
    Usage of pqarrow reading List/Map typed fields will now correctly
    contain the `PARQUET:field_id` metadata key in the schema produced.
---
 arrow/array/util.go                  |  2 +-
 arrow/datatype_nested.go             | 17 +++++++++++++++++
 arrow/flight/server_example_test.go  |  3 ---
 parquet/pqarrow/encode_arrow_test.go |  4 +++-
 parquet/pqarrow/schema.go            |  7 +++++--
 parquet/pqarrow/schema_test.go       | 37 +++++++++++++++++++-----------------
 6 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/arrow/array/util.go b/arrow/array/util.go
index e21290f..53a1fdd 100644
--- a/arrow/array/util.go
+++ b/arrow/array/util.go
@@ -504,7 +504,7 @@ func (n *nullArrayFactory) create() *Data {
        return out
 }
 
-func (n *nullArrayFactory) createChild(dt arrow.DataType, i, length int) *Data 
{
+func (n *nullArrayFactory) createChild(_ arrow.DataType, i, length int) *Data {
        childFactory := &nullArrayFactory{
                mem: n.mem, dt: n.dt.(arrow.NestedType).Fields()[i].Type,
                len: length, buf: n.buf}
diff --git a/arrow/datatype_nested.go b/arrow/datatype_nested.go
index 32c3ed0..6664c50 100644
--- a/arrow/datatype_nested.go
+++ b/arrow/datatype_nested.go
@@ -536,6 +536,23 @@ func MapOf(key, item DataType) *MapType {
        return &MapType{value: ListOf(StructOf(Field{Name: "key", Type: key}, 
Field{Name: "value", Type: item, Nullable: true}))}
 }
 
+func MapOfFields(key, item Field) *MapType {
+       if key.Type == nil || item.Type == nil {
+               panic("arrow: nil key or item type for MapType")
+       }
+
+       if key.Nullable {
+               panic("arrow: key field must be non-nullable")
+       }
+
+       key.Name = "key"
+       item.Name = "value"
+       return &MapType{value: ListOfField(Field{
+               Name: "entries",
+               Type: StructOf(key, item),
+       })}
+}
+
 func MapOfWithMetadata(key DataType, keyMetadata Metadata, item DataType, 
itemMetadata Metadata) *MapType {
        if key == nil || item == nil {
                panic("arrow: nil key or item type for MapType")
diff --git a/arrow/flight/server_example_test.go 
b/arrow/flight/server_example_test.go
index e41482c..2c54a66 100644
--- a/arrow/flight/server_example_test.go
+++ b/arrow/flight/server_example_test.go
@@ -64,9 +64,6 @@ func ExampleRegisterFlightServiceServer() {
 
        fmt.Println(rsp.Status)
        fc := flight.NewClientFromConn(conn, nil)
-       if err != nil {
-               panic(err)
-       }
 
        // we didn't implement GetFlightInfo so we should get an Unimplemented
        // error, proving it did call into the base flight server. If we didn't
diff --git a/parquet/pqarrow/encode_arrow_test.go 
b/parquet/pqarrow/encode_arrow_test.go
index 78e9c68..0a2edab 100644
--- a/parquet/pqarrow/encode_arrow_test.go
+++ b/parquet/pqarrow/encode_arrow_test.go
@@ -2278,7 +2278,9 @@ func TestWriteTableMemoryAllocation(t *testing.T) {
 
 func TestEmptyListDeltaBinaryPacked(t *testing.T) {
        schema := arrow.NewSchema([]arrow.Field{
-               {Name: "ts", Type: arrow.ListOf(arrow.PrimitiveTypes.Uint64),
+               {Name: "ts", Type: arrow.ListOfField(
+                       arrow.Field{Name: "list", Type: 
arrow.PrimitiveTypes.Uint64, Nullable: true,
+                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}),
                        Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}}, nil)
        builder := array.NewRecordBuilder(memory.DefaultAllocator, schema)
        defer builder.Release()
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index 8eccd57..8642a2e 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -731,7 +731,10 @@ func listToSchemaField(n *schema.GroupNode, currentLevels 
file.LevelInfo, ctx *s
        }
 
        out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOfField(
-               arrow.Field{Name: listNode.Name(), Type: 
out.Children[0].Field.Type, Nullable: true}),
+               arrow.Field{Name: listNode.Name(),
+                       Type:     out.Children[0].Field.Type,
+                       Metadata: out.Children[0].Field.Metadata,
+                       Nullable: true}),
                Nullable: n.RepetitionType() == parquet.Repetitions.Optional, 
Metadata: createFieldMeta(int(n.FieldID()))}
 
        out.LevelInfo = currentLevels
@@ -826,7 +829,7 @@ func mapToSchemaField(n *schema.GroupNode, currentLevels 
file.LevelInfo, ctx *sc
                Nullable: false, Metadata: 
createFieldMeta(int(kvgroup.FieldID()))}
 
        kvfield.LevelInfo = currentLevels
-       out.Field = &arrow.Field{Name: n.Name(), Type: 
arrow.MapOf(keyField.Field.Type, valueField.Field.Type),
+       out.Field = &arrow.Field{Name: n.Name(), Type: 
arrow.MapOfFields(*keyField.Field, *valueField.Field),
                Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
                Metadata: createFieldMeta(int(n.FieldID()))}
        out.LevelInfo = currentLevels
diff --git a/parquet/pqarrow/schema_test.go b/parquet/pqarrow/schema_test.go
index 1c9d24a..598386d 100644
--- a/parquet/pqarrow/schema_test.go
+++ b/parquet/pqarrow/schema_test.go
@@ -370,11 +370,11 @@ func TestListStructBackwardCompatible(t *testing.T) {
        // the 3-level encoding. Schema looks like:
        //
        //     required group field_id=-1 root {
-       //       optional group field_id=-1 answers (List) {
+       //       optional group field_id=1 answers (List) {
        //                 repeated group field_id=-1 array {
-       //           optional byte_array field_id=-1 type (String);
-       //           optional byte_array field_id=-1 rdata (String);
-       //           optional byte_array field_id=-1 class (String);
+       //           optional byte_array field_id=2 type (String);
+       //           optional byte_array field_id=3 rdata (String);
+       //           optional byte_array field_id=4 class (String);
        //         }
        //       }
        //     }
@@ -382,12 +382,12 @@ func TestListStructBackwardCompatible(t *testing.T) {
        // Instead of the proper 3-level encoding which would be:
        //
        //     repeated group field_id=-1 schema {
-       //       optional group field_id=-1 answers (List) {
+       //       optional group field_id=1 answers (List) {
        //         repeated group field_id=-1 list {
        //           optional group field_id=-1 element {
-       //             optional byte_array field_id=-1 type (String);
-       //             optional byte_array field_id=-1 rdata (String);
-       //             optional byte_array field_id=-1 class (String);
+       //             optional byte_array field_id=2 type (String);
+       //             optional byte_array field_id=3 rdata (String);
+       //             optional byte_array field_id=4 class (String);
        //           }
        //         }
        //       }
@@ -397,25 +397,28 @@ func TestListStructBackwardCompatible(t *testing.T) {
                schema.Must(schema.NewGroupNodeLogical("answers", 
parquet.Repetitions.Optional, schema.FieldList{
                        schema.Must(schema.NewGroupNode("array", 
parquet.Repetitions.Repeated, schema.FieldList{
                                
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("type", 
parquet.Repetitions.Optional,
-                                       schema.StringLogicalType{}, 
parquet.Types.ByteArray, -1, -1)),
+                                       schema.StringLogicalType{}, 
parquet.Types.ByteArray, -1, 2)),
                                
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("rdata", 
parquet.Repetitions.Optional,
-                                       schema.StringLogicalType{}, 
parquet.Types.ByteArray, -1, -1)),
+                                       schema.StringLogicalType{}, 
parquet.Types.ByteArray, -1, 3)),
                                
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("class", 
parquet.Repetitions.Optional,
-                                       schema.StringLogicalType{}, 
parquet.Types.ByteArray, -1, -1)),
+                                       schema.StringLogicalType{}, 
parquet.Types.ByteArray, -1, 4)),
                        }, -1)),
-               }, schema.NewListLogicalType(), -1)),
+               }, schema.NewListLogicalType(), 1)),
        }, -1)))
 
-       meta := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
        // desired equivalent arrow schema would be list<item: struct<type: 
utf8, rdata: utf8, class: utf8>>
        arrowSchema := arrow.NewSchema(
                []arrow.Field{
                        {Name: "answers", Type: arrow.ListOfField(arrow.Field{
                                Name: "array", Type: arrow.StructOf(
-                                       arrow.Field{Name: "type", Type: 
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
-                                       arrow.Field{Name: "rdata", Type: 
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
-                                       arrow.Field{Name: "class", Type: 
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
-                               ), Nullable: true}), Nullable: true, Metadata: 
meta},
+                                       arrow.Field{Name: "type", Type: 
arrow.BinaryTypes.String, Nullable: true,
+                                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"2"})},
+                                       arrow.Field{Name: "rdata", Type: 
arrow.BinaryTypes.String, Nullable: true,
+                                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"3"})},
+                                       arrow.Field{Name: "class", Type: 
arrow.BinaryTypes.String, Nullable: true,
+                                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"4"})},
+                               ), Nullable: true, Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}),
+                               Nullable: true, Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"1"})},
                }, nil)
 
        arrsc, err := pqarrow.FromParquet(pqSchema, nil, 
metadata.KeyValueMetadata{})

Reply via email to