This is an automated email from the ASF dual-hosted git repository. zeroshade pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new f26f59ec fix: correctly initialize SchemaField.ColIndex (#591)
f26f59ec is described below
commit f26f59ec289e488f8b4ddf0d64eafd7cdd8e322c
Author: James Guthrie <[email protected]>
AuthorDate: Tue Dec 2 15:52:48 2025 +0000
fix: correctly initialize SchemaField.ColIndex (#591)
### Rationale for this change
The documentation for SchemaField says:
> ColIndex is only populated (not -1) when it is a leaf column.
But the implementation never actually initializes the value to -1, so it
is incorrectly set to 0 on non-leaf nodes.
### What changes are included in this PR?
This commit adds a helper function to correctly initialize SchemaField,
and comprehensive test cases to cover all existing branches.
### Are these changes tested?
Yes, see test cases.
### Are there any user-facing changes?
---
parquet/pqarrow/schema.go | 24 ++--
parquet/pqarrow/schema_test.go | 310 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 326 insertions(+), 8 deletions(-)
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index 7c56e333..aac64825 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -685,7 +685,7 @@ func listToSchemaField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *s
currentLevels.Increment(n)
- out.Children = make([]SchemaField, n.NumFields())
+ out.Children = makeSchemaFields(n.NumFields())
ctx.LinkParent(out, parent)
ctx.LinkParent(&out.Children[0], out)
@@ -774,7 +774,7 @@ func listToSchemaField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *s
func groupToStructField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx
*schemaTree, out *SchemaField) error {
arrowFields := make([]arrow.Field, 0, n.NumFields())
- out.Children = make([]SchemaField, n.NumFields())
+ out.Children = makeSchemaFields(n.NumFields())
for i := 0; i < n.NumFields(); i++ {
if err := nodeToSchemaField(n.Field(i), currentLevels, ctx,
out, &out.Children[i]); err != nil {
@@ -825,10 +825,10 @@ func mapToSchemaField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *sc
currentLevels.Increment(n)
repeatedAncestorDef := currentLevels.IncrementRepeated()
- out.Children = make([]SchemaField, 1)
+ out.Children = makeSchemaFields(1)
kvfield := &out.Children[0]
- kvfield.Children = make([]SchemaField, 2)
+ kvfield.Children = makeSchemaFields(2)
keyField := &kvfield.Children[0]
valueField := &kvfield.Children[1]
@@ -876,7 +876,7 @@ func variantToSchemaField(n *schema.GroupNode,
currentLevels file.LevelInfo, ctx
if n.RepetitionType() == parquet.Repetitions.Repeated {
// list of variants
- out.Children = make([]SchemaField, 1)
+ out.Children = makeSchemaFields(1)
repeatedAncestorDef := currentLevels.IncrementRepeated()
if err := groupToStructField(n, currentLevels, ctx,
&out.Children[0]); err != nil {
return err
@@ -925,7 +925,7 @@ func groupToSchemaField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *
// r/o TYPE[0] f0
// r/o TYPE[1] f1
// }
- out.Children = make([]SchemaField, 1)
+ out.Children = makeSchemaFields(1)
repeatedAncestorDef := currentLevels.IncrementRepeated()
if err := groupToStructField(n, currentLevels, ctx,
&out.Children[0]); err != nil {
return err
@@ -987,7 +987,7 @@ func nodeToSchemaField(n schema.Node, currentLevels
file.LevelInfo, ctx *schemaT
if primitive.RepetitionType() == parquet.Repetitions.Repeated {
// one-level list encoding e.g. a: repeated int32;
repeatedAncestorDefLevel := currentLevels.IncrementRepeated()
- out.Children = make([]SchemaField, 1)
+ out.Children = makeSchemaFields(1)
child := arrow.Field{Name: primitive.Name(), Type: arrowType,
Nullable: false}
populateLeaf(colIndex, &child, currentLevels, ctx, out,
&out.Children[0])
out.Field = &arrow.Field{Name: primitive.Name(), Type:
arrow.ListOf(child.Type), Nullable: false,
@@ -1206,7 +1206,7 @@ func NewSchemaManifest(sc *schema.Schema, meta
metadata.KeyValueMetadata, props
ColIndexToField: make(map[int]*SchemaField),
ChildToParent: make(map[*SchemaField]*SchemaField),
descr: sc,
- Fields: make([]SchemaField, sc.Root().NumFields()),
+ Fields: makeSchemaFields(sc.Root().NumFields()),
}
ctx.props = props
if ctx.props == nil {
@@ -1258,3 +1258,11 @@ func FromParquet(sc *schema.Schema, props
*ArrowReadProperties, kv metadata.KeyV
}
return arrow.NewSchema(fields, manifest.SchemaMeta), nil
}
+
+func makeSchemaFields(n int) []SchemaField {
+ fields := make([]SchemaField, n)
+ for i := range fields {
+ fields[i].ColIndex = -1
+ }
+ return fields
+}
diff --git a/parquet/pqarrow/schema_test.go b/parquet/pqarrow/schema_test.go
index 6f5d14c7..e6c7a5a3 100644
--- a/parquet/pqarrow/schema_test.go
+++ b/parquet/pqarrow/schema_test.go
@@ -426,6 +426,316 @@ func TestListStructBackwardCompatible(t *testing.T) {
assert.True(t, arrowSchema.Equal(arrsc), arrowSchema.String(),
arrsc.String())
}
+func TestNestedFieldIndex(t *testing.T) {
+ cases := []struct {
+ name string
+ s *schema.Schema
+ }{
+ {
+ name: "RepeatedPrimitive",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "tags",
+ parquet.Repetitions.Repeated,
+ schema.StringLogicalType{},
+ parquet.Types.ByteArray,
+ 0,
+ -1,
+ )),
+ }, -1),
+ )),
+ }, {
+ name: "SimpleList",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+ schema.MustGroup(schema.ListOf(
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "element",
+
parquet.Repetitions.Required,
+
schema.StringLogicalType{},
+ parquet.Types.ByteArray,
+ 0,
+ -1,
+ )),
+ parquet.Repetitions.Required,
+ -1,
+ )),
+ }, -1),
+ )),
+ },
+ {
+ name: "ListOfOptionalElements",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+ schema.MustGroup(schema.ListOf(
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "element",
+
parquet.Repetitions.Optional,
+
schema.StringLogicalType{},
+ parquet.Types.ByteArray,
+ 1,
+ -1,
+ )),
+ parquet.Repetitions.Required,
+ -1,
+ )),
+ }, -1),
+ )),
+ },
+ {
+ name: "NestedStruct",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "name",
+ parquet.Repetitions.Required,
+ schema.StringLogicalType{},
+ parquet.Types.ByteArray,
+ 0,
+ -1,
+ )),
+
schema.MustGroup(schema.NewGroupNode("address", parquet.Repetitions.Required,
schema.FieldList{
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "city",
+
parquet.Repetitions.Required,
+
schema.StringLogicalType{},
+ parquet.Types.ByteArray,
+ 1,
+ -1,
+ )),
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "street",
+
parquet.Repetitions.Optional,
+
schema.StringLogicalType{},
+ parquet.Types.ByteArray,
+ 2,
+ -1,
+ )),
+ }, 3)),
+ }, -1),
+ )),
+ },
+ {
+ name: "ListOfStructs",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+ schema.MustGroup(schema.ListOf(
+
schema.MustGroup(schema.NewGroupNode("contact", parquet.Repetitions.Required,
schema.FieldList{
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "email",
+
parquet.Repetitions.Required,
+
schema.StringLogicalType{},
+
parquet.Types.ByteArray,
+ 1,
+ -1,
+ )),
+ }, 2)),
+ parquet.Repetitions.Required,
+ -1,
+ )),
+ }, -1),
+ )),
+ },
+ {
+ name: "MapRequiredValues",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+ schema.MustGroup(schema.MapOf(
+ "attrs",
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "key",
+
parquet.Repetitions.Required,
+
schema.StringLogicalType{},
+ parquet.Types.ByteArray,
+ 1,
+ -1,
+ )),
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "value",
+
parquet.Repetitions.Required,
+
schema.StringLogicalType{},
+ parquet.Types.ByteArray,
+ 2,
+ -1,
+ )),
+ parquet.Repetitions.Required,
+ 0,
+ )),
+ }, -1),
+ )),
+ },
+ {
+ name: "RepeatedGroup",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+
schema.MustGroup(schema.NewGroupNode("items", parquet.Repetitions.Repeated,
schema.FieldList{
+
schema.Must(schema.NewPrimitiveNode(
+ "id",
+
parquet.Repetitions.Required,
+ parquet.Types.Int32,
+ 0,
+ -1,
+ )),
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "tag",
+
parquet.Repetitions.Optional,
+
schema.StringLogicalType{},
+ parquet.Types.ByteArray,
+ 1,
+ -1,
+ )),
+ }, 2)),
+ }, -1),
+ )),
+ },
+ {
+ name: "DeeplyNested",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+
schema.MustGroup(schema.NewGroupNode("level1", parquet.Repetitions.Required,
schema.FieldList{
+
schema.MustGroup(schema.NewGroupNode("level2", parquet.Repetitions.Required,
schema.FieldList{
+
schema.MustGroup(schema.ListOf(
+
schema.Must(schema.NewPrimitiveNode(
+
"element",
+
parquet.Repetitions.Required,
+
parquet.Types.Int32,
+ 2,
+ -1,
+ )),
+
parquet.Repetitions.Required,
+ -1,
+ )),
+ }, 1)),
+ }, 0)),
+ }, -1),
+ )),
+ },
+ {
+ name: "ListOfMaps",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+ schema.MustGroup(schema.ListOf(
+ schema.MustGroup(schema.MapOf(
+ "map_item",
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "key",
+
parquet.Repetitions.Required,
+
schema.StringLogicalType{},
+
parquet.Types.ByteArray,
+ 1,
+ -1,
+ )),
+
schema.Must(schema.NewPrimitiveNode(
+ "value",
+
parquet.Repetitions.Required,
+
parquet.Types.Int32,
+ 2,
+ -1,
+ )),
+
parquet.Repetitions.Required,
+ 0,
+ )),
+ parquet.Repetitions.Required,
+ -1,
+ )),
+ }, -1),
+ )),
+ },
+ {
+ name: "RepeatedVariant",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+
schema.MustGroup(schema.NewGroupNodeLogical(
+ "variant_list",
+ parquet.Repetitions.Repeated,
+ schema.FieldList{
+
schema.Must(schema.NewPrimitiveNode(
+ "metadata",
+
parquet.Repetitions.Required,
+
parquet.Types.ByteArray,
+ 1,
+ -1,
+ )),
+
schema.Must(schema.NewPrimitiveNode(
+ "value",
+
parquet.Repetitions.Optional,
+
parquet.Types.ByteArray,
+ 2,
+ -1,
+ )),
+ },
+ schema.VariantLogicalType{},
+ 0,
+ )),
+ }, -1),
+ )),
+ },
+ {
+ name: "MapNestedKeyAndValue",
+ s: schema.NewSchema(schema.MustGroup(
+ schema.NewGroupNode("schema",
parquet.Repetitions.Required, schema.FieldList{
+ schema.MustGroup(schema.MapOf(
+ "attrs",
+ // Nested key: struct with id
and name
+
schema.MustGroup(schema.NewGroupNode("key_struct",
parquet.Repetitions.Required, schema.FieldList{
+
schema.Must(schema.NewPrimitiveNode(
+ "id",
+
parquet.Repetitions.Required,
+
parquet.Types.Int32,
+ 1,
+ -1,
+ )),
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "name",
+
parquet.Repetitions.Required,
+
schema.StringLogicalType{},
+
parquet.Types.ByteArray,
+ 2,
+ -1,
+ )),
+ }, 3)),
+ // Nested value: list of strings
+ schema.MustGroup(schema.ListOf(
+
schema.Must(schema.NewPrimitiveNodeLogical(
+ "element",
+
parquet.Repetitions.Required,
+
schema.StringLogicalType{},
+
parquet.Types.ByteArray,
+ 5,
+ -1,
+ )),
+
parquet.Repetitions.Required,
+ -1,
+ )),
+ parquet.Repetitions.Required,
+ 0,
+ )),
+ }, -1),
+ )),
+ },
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ manifest, err := pqarrow.NewSchemaManifest(tc.s,
metadata.KeyValueMetadata{}, nil)
+ assert.NoError(t, err)
+ assertFieldColIndexCorrect(t, manifest.Fields)
+ })
+ }
+}
+
+func assertFieldColIndexCorrect(t *testing.T, fields []pqarrow.SchemaField) {
+ for _, field := range fields {
+ if field.Children != nil {
+ assert.Equal(t, -1, field.ColIndex)
+ assertFieldColIndexCorrect(t, field.Children)
+ } else {
+ assert.NotEqual(t, -1, field.Children)
+ }
+ }
+}
+
// TestUnsupportedTypes tests the error message for unsupported types. This
test should be updated
// when support for these types is added.
func TestUnsupportedTypes(t *testing.T) {
