This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9a7277a27d GH-34101: [Go][Parquet] NewSchemaManifest creates wrong
schema field (#34127)
9a7277a27d is described below
commit 9a7277a27d2d815d5ae6946e17e1ff45147c7480
Author: Matt Topol <[email protected]>
AuthorDate: Mon Feb 13 10:57:54 2023 -0500
GH-34101: [Go][Parquet] NewSchemaManifest creates wrong schema field
(#34127)
### Rationale for this change
Bug was found in the handling of the special cases when converting Parquet
Schemas to Arrow Schemas.
### What changes are included in this PR?
Parquet Schemas containing a repeated group (list) field with multiple
child fields, even when named "array" or "<list_name>_tuple" will properly
convert to a list of struct now.
### Are these changes tested?
Yes, unit test was added for this case.
### Are there any user-facing changes?
Fixes a bug that caused users to get a list of a struct with a single field
rather than a list of struct with multiple fields.
* Closes: #34101
Authored-by: Matt Topol <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
---
go/parquet/pqarrow/schema.go | 11 ++++----
go/parquet/pqarrow/schema_test.go | 58 +++++++++++++++++++++++++++++++++++++++
2 files changed, 64 insertions(+), 5 deletions(-)
diff --git a/go/parquet/pqarrow/schema.go b/go/parquet/pqarrow/schema.go
index 8fb25cc80f..faa2c59b68 100644
--- a/go/parquet/pqarrow/schema.go
+++ b/go/parquet/pqarrow/schema.go
@@ -21,7 +21,6 @@ import (
"fmt"
"math"
"strconv"
- "strings"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/flight"
@@ -653,13 +652,13 @@ func listToSchemaField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *s
// If the name is array or ends in _tuple, this should be a
list of struct
// even for single child elements.
listGroup := listNode.(*schema.GroupNode)
- if listGroup.NumFields() == 1 && (listGroup.Name() == "array"
|| strings.HasSuffix(listGroup.Name(), "_tuple")) {
+ if listGroup.NumFields() == 1 && !(listGroup.Name() == "array"
|| listGroup.Name() == (n.Name()+"_tuple")) {
// list of primitive type
- if err := groupToStructField(listGroup, currentLevels,
ctx, out, &out.Children[0]); err != nil {
+ if err := nodeToSchemaField(listGroup.Field(0),
currentLevels, ctx, out, &out.Children[0]); err != nil {
return err
}
} else {
- if err := nodeToSchemaField(listGroup.Field(0),
currentLevels, ctx, out, &out.Children[0]); err != nil {
+ if err := groupToStructField(listGroup, currentLevels,
ctx, out, &out.Children[0]); err != nil {
return err
}
}
@@ -680,8 +679,10 @@ func listToSchemaField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *s
populateLeaf(colIndex, &itemField, currentLevels, ctx, out,
&out.Children[0])
}
- out.Field = &arrow.Field{Name: n.Name(), Type:
arrow.ListOf(out.Children[0].Field.Type),
+ out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOfField(
+ arrow.Field{Name: listNode.Name(), Type:
out.Children[0].Field.Type, Nullable: true}),
Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
Metadata: createFieldMeta(int(n.FieldID()))}
+
out.LevelInfo = currentLevels
// At this point current levels contains the def level for this list,
// we need to reset to the prior parent.
diff --git a/go/parquet/pqarrow/schema_test.go
b/go/parquet/pqarrow/schema_test.go
index e6de8ae250..07133fb400 100644
--- a/go/parquet/pqarrow/schema_test.go
+++ b/go/parquet/pqarrow/schema_test.go
@@ -309,3 +309,61 @@ func TestConvertArrowStruct(t *testing.T) {
assert.Truef(t,
parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s",
i, parquetSchema.Column(i).Name())
}
}
+
+func TestListStructBackwardCompatible(t *testing.T) {
+ // Set up old construction for list of struct, not using
+ // the 3-level encoding. Schema looks like:
+ //
+ // required group field_id=-1 root {
+ // optional group field_id=-1 answers (List) {
+ // repeated group field_id=-1 array {
+ // optional byte_array field_id=-1 type (String);
+ // optional byte_array field_id=-1 rdata (String);
+ // optional byte_array field_id=-1 class (String);
+ // }
+ // }
+ // }
+ //
+ // Instaed of the proper 3-level encoding which would be:
+ //
+ // repeated group field_id=-1 schema {
+ // optional group field_id=-1 answers (List) {
+ // repeated group field_id=-1 list {
+ // optional group field_id=-1 element {
+ // optional byte_array field_id=-1 type (String);
+ // optional byte_array field_id=-1 rdata (String);
+ // optional byte_array field_id=-1 class (String);
+ // }
+ // }
+ // }
+ // }
+ //
+ pqSchema :=
schema.NewSchema(schema.MustGroup(schema.NewGroupNode("root",
parquet.Repetitions.Required, schema.FieldList{
+ schema.Must(schema.NewGroupNodeLogical("answers",
parquet.Repetitions.Optional, schema.FieldList{
+ schema.Must(schema.NewGroupNode("array",
parquet.Repetitions.Repeated, schema.FieldList{
+
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("type",
parquet.Repetitions.Optional,
+ schema.StringLogicalType{},
parquet.Types.ByteArray, -1, -1)),
+
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("rdata",
parquet.Repetitions.Optional,
+ schema.StringLogicalType{},
parquet.Types.ByteArray, -1, -1)),
+
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("class",
parquet.Repetitions.Optional,
+ schema.StringLogicalType{},
parquet.Types.ByteArray, -1, -1)),
+ }, -1)),
+ }, schema.NewListLogicalType(), -1)),
+ }, -1)))
+
+ meta := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
+ // desired equivalent arrow schema would be list<item: struct<type:
utf8, rdata: utf8, class: utf8>>
+ arrowSchema := arrow.NewSchema(
+ []arrow.Field{
+ {Name: "answers", Type: arrow.ListOfField(arrow.Field{
+ Name: "array", Type: arrow.StructOf(
+ arrow.Field{Name: "type", Type:
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
+ arrow.Field{Name: "rdata", Type:
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
+ arrow.Field{Name: "class", Type:
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
+ ), Nullable: true}), Nullable: true, Metadata:
meta},
+ }, nil)
+
+ arrsc, err := pqarrow.FromParquet(pqSchema, nil,
metadata.KeyValueMetadata{})
+ assert.NoError(t, err)
+ assert.True(t, arrowSchema.Equal(arrsc))
+}