This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new ca6e0eb fix(parquet/pqarrow): propagate field id metadata for
lists/maps (#293)
ca6e0eb is described below
commit ca6e0eb477c57a16edd787f6eba4c097a11d9c7a
Author: Matt Topol <[email protected]>
AuthorDate: Fri Feb 21 12:50:54 2025 -0500
fix(parquet/pqarrow): propagate field id metadata for lists/maps (#293)
### Rationale for this change
Discovered while fixing https://github.com/apache/iceberg-go/issues/309
we didn't correctly propagate the field-id metadata to children of List
or Map fields, only structs.
### What changes are included in this PR?
A new MapType creator for constructing MapTypes from arrow fields for
the Key and Items for easier construction, and fixing the `pqarrow`
schema manifest creation to correctly propagate the child metadata field
IDs for the children.
### Are these changes tested?
Unit test is added.
### Are there any user-facing changes?
Usage of pqarrow reading List/Map typed fields will now correctly
contain the `PARQUET:field_id` metadata key in the schema produced.
---
arrow/array/util.go | 2 +-
arrow/datatype_nested.go | 17 +++++++++++++++++
arrow/flight/server_example_test.go | 3 ---
parquet/pqarrow/encode_arrow_test.go | 4 +++-
parquet/pqarrow/schema.go | 7 +++++--
parquet/pqarrow/schema_test.go | 37 +++++++++++++++++++-----------------
6 files changed, 46 insertions(+), 24 deletions(-)
diff --git a/arrow/array/util.go b/arrow/array/util.go
index e21290f..53a1fdd 100644
--- a/arrow/array/util.go
+++ b/arrow/array/util.go
@@ -504,7 +504,7 @@ func (n *nullArrayFactory) create() *Data {
return out
}
-func (n *nullArrayFactory) createChild(dt arrow.DataType, i, length int) *Data
{
+func (n *nullArrayFactory) createChild(_ arrow.DataType, i, length int) *Data {
childFactory := &nullArrayFactory{
mem: n.mem, dt: n.dt.(arrow.NestedType).Fields()[i].Type,
len: length, buf: n.buf}
diff --git a/arrow/datatype_nested.go b/arrow/datatype_nested.go
index 32c3ed0..6664c50 100644
--- a/arrow/datatype_nested.go
+++ b/arrow/datatype_nested.go
@@ -536,6 +536,23 @@ func MapOf(key, item DataType) *MapType {
return &MapType{value: ListOf(StructOf(Field{Name: "key", Type: key},
Field{Name: "value", Type: item, Nullable: true}))}
}
+func MapOfFields(key, item Field) *MapType {
+ if key.Type == nil || item.Type == nil {
+ panic("arrow: nil key or item type for MapType")
+ }
+
+ if key.Nullable {
+ panic("arrow: key field must be non-nullable")
+ }
+
+ key.Name = "key"
+ item.Name = "value"
+ return &MapType{value: ListOfField(Field{
+ Name: "entries",
+ Type: StructOf(key, item),
+ })}
+}
+
func MapOfWithMetadata(key DataType, keyMetadata Metadata, item DataType,
itemMetadata Metadata) *MapType {
if key == nil || item == nil {
panic("arrow: nil key or item type for MapType")
diff --git a/arrow/flight/server_example_test.go
b/arrow/flight/server_example_test.go
index e41482c..2c54a66 100644
--- a/arrow/flight/server_example_test.go
+++ b/arrow/flight/server_example_test.go
@@ -64,9 +64,6 @@ func ExampleRegisterFlightServiceServer() {
fmt.Println(rsp.Status)
fc := flight.NewClientFromConn(conn, nil)
- if err != nil {
- panic(err)
- }
// we didn't implement GetFlightInfo so we should get an Unimplemented
// error, proving it did call into the base flight server. If we didn't
diff --git a/parquet/pqarrow/encode_arrow_test.go
b/parquet/pqarrow/encode_arrow_test.go
index 78e9c68..0a2edab 100644
--- a/parquet/pqarrow/encode_arrow_test.go
+++ b/parquet/pqarrow/encode_arrow_test.go
@@ -2278,7 +2278,9 @@ func TestWriteTableMemoryAllocation(t *testing.T) {
func TestEmptyListDeltaBinaryPacked(t *testing.T) {
schema := arrow.NewSchema([]arrow.Field{
- {Name: "ts", Type: arrow.ListOf(arrow.PrimitiveTypes.Uint64),
+ {Name: "ts", Type: arrow.ListOfField(
+ arrow.Field{Name: "list", Type:
arrow.PrimitiveTypes.Uint64, Nullable: true,
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}),
Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}}, nil)
builder := array.NewRecordBuilder(memory.DefaultAllocator, schema)
defer builder.Release()
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index 8eccd57..8642a2e 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -731,7 +731,10 @@ func listToSchemaField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *s
}
out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOfField(
- arrow.Field{Name: listNode.Name(), Type:
out.Children[0].Field.Type, Nullable: true}),
+ arrow.Field{Name: listNode.Name(),
+ Type: out.Children[0].Field.Type,
+ Metadata: out.Children[0].Field.Metadata,
+ Nullable: true}),
Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
Metadata: createFieldMeta(int(n.FieldID()))}
out.LevelInfo = currentLevels
@@ -826,7 +829,7 @@ func mapToSchemaField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *sc
Nullable: false, Metadata:
createFieldMeta(int(kvgroup.FieldID()))}
kvfield.LevelInfo = currentLevels
- out.Field = &arrow.Field{Name: n.Name(), Type:
arrow.MapOf(keyField.Field.Type, valueField.Field.Type),
+ out.Field = &arrow.Field{Name: n.Name(), Type:
arrow.MapOfFields(*keyField.Field, *valueField.Field),
Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
Metadata: createFieldMeta(int(n.FieldID()))}
out.LevelInfo = currentLevels
diff --git a/parquet/pqarrow/schema_test.go b/parquet/pqarrow/schema_test.go
index 1c9d24a..598386d 100644
--- a/parquet/pqarrow/schema_test.go
+++ b/parquet/pqarrow/schema_test.go
@@ -370,11 +370,11 @@ func TestListStructBackwardCompatible(t *testing.T) {
// the 3-level encoding. Schema looks like:
//
// required group field_id=-1 root {
- // optional group field_id=-1 answers (List) {
+ // optional group field_id=1 answers (List) {
// repeated group field_id=-1 array {
- // optional byte_array field_id=-1 type (String);
- // optional byte_array field_id=-1 rdata (String);
- // optional byte_array field_id=-1 class (String);
+ // optional byte_array field_id=2 type (String);
+ // optional byte_array field_id=3 rdata (String);
+ // optional byte_array field_id=4 class (String);
// }
// }
// }
@@ -382,12 +382,12 @@ func TestListStructBackwardCompatible(t *testing.T) {
// Instead of the proper 3-level encoding which would be:
//
// repeated group field_id=-1 schema {
- // optional group field_id=-1 answers (List) {
+ // optional group field_id=1 answers (List) {
// repeated group field_id=-1 list {
// optional group field_id=-1 element {
- // optional byte_array field_id=-1 type (String);
- // optional byte_array field_id=-1 rdata (String);
- // optional byte_array field_id=-1 class (String);
+ // optional byte_array field_id=2 type (String);
+ // optional byte_array field_id=3 rdata (String);
+ // optional byte_array field_id=4 class (String);
// }
// }
// }
@@ -397,25 +397,28 @@ func TestListStructBackwardCompatible(t *testing.T) {
schema.Must(schema.NewGroupNodeLogical("answers",
parquet.Repetitions.Optional, schema.FieldList{
schema.Must(schema.NewGroupNode("array",
parquet.Repetitions.Repeated, schema.FieldList{
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("type",
parquet.Repetitions.Optional,
- schema.StringLogicalType{},
parquet.Types.ByteArray, -1, -1)),
+ schema.StringLogicalType{},
parquet.Types.ByteArray, -1, 2)),
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("rdata",
parquet.Repetitions.Optional,
- schema.StringLogicalType{},
parquet.Types.ByteArray, -1, -1)),
+ schema.StringLogicalType{},
parquet.Types.ByteArray, -1, 3)),
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("class",
parquet.Repetitions.Optional,
- schema.StringLogicalType{},
parquet.Types.ByteArray, -1, -1)),
+ schema.StringLogicalType{},
parquet.Types.ByteArray, -1, 4)),
}, -1)),
- }, schema.NewListLogicalType(), -1)),
+ }, schema.NewListLogicalType(), 1)),
}, -1)))
- meta := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
// desired equivalent arrow schema would be list<item: struct<type:
utf8, rdata: utf8, class: utf8>>
arrowSchema := arrow.NewSchema(
[]arrow.Field{
{Name: "answers", Type: arrow.ListOfField(arrow.Field{
Name: "array", Type: arrow.StructOf(
- arrow.Field{Name: "type", Type:
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
- arrow.Field{Name: "rdata", Type:
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
- arrow.Field{Name: "class", Type:
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
- ), Nullable: true}), Nullable: true, Metadata:
meta},
+ arrow.Field{Name: "type", Type:
arrow.BinaryTypes.String, Nullable: true,
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"2"})},
+ arrow.Field{Name: "rdata", Type:
arrow.BinaryTypes.String, Nullable: true,
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"3"})},
+ arrow.Field{Name: "class", Type:
arrow.BinaryTypes.String, Nullable: true,
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"4"})},
+ ), Nullable: true, Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}),
+ Nullable: true, Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"1"})},
}, nil)
arrsc, err := pqarrow.FromParquet(pqSchema, nil,
metadata.KeyValueMetadata{})