This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9a7277a27d GH-34101: [Go][Parquet] NewSchemaManifest creates wrong 
schema field (#34127)
9a7277a27d is described below

commit 9a7277a27d2d815d5ae6946e17e1ff45147c7480
Author: Matt Topol <[email protected]>
AuthorDate: Mon Feb 13 10:57:54 2023 -0500

    GH-34101: [Go][Parquet] NewSchemaManifest creates wrong schema field 
(#34127)
    
    
    
    ### Rationale for this change
    Bug was found in the handling of the special cases when converting Parquet 
Schemas to Arrow Schemas.
    
    ### What changes are included in this PR?
    Parquet Schemas containing a repeated group (list) field with multiple 
child fields, even when named "array" or "<list_name>_tuple" will properly 
convert to a list of struct now.
    
    ### Are these changes tested?
    Yes, unit test was added for this case.
    
    ### Are there any user-facing changes?
    Fixes a bug that caused users to get a list of a struct with a single field 
rather than a list of struct with multiple fields.
    
    * Closes: #34101
    
    Authored-by: Matt Topol <[email protected]>
    Signed-off-by: Matt Topol <[email protected]>
---
 go/parquet/pqarrow/schema.go      | 11 ++++----
 go/parquet/pqarrow/schema_test.go | 58 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/go/parquet/pqarrow/schema.go b/go/parquet/pqarrow/schema.go
index 8fb25cc80f..faa2c59b68 100644
--- a/go/parquet/pqarrow/schema.go
+++ b/go/parquet/pqarrow/schema.go
@@ -21,7 +21,6 @@ import (
        "fmt"
        "math"
        "strconv"
-       "strings"
 
        "github.com/apache/arrow/go/v12/arrow"
        "github.com/apache/arrow/go/v12/arrow/flight"
@@ -653,13 +652,13 @@ func listToSchemaField(n *schema.GroupNode, currentLevels 
file.LevelInfo, ctx *s
                //   If the name is array or ends in _tuple, this should be a 
list of struct
                //   even for single child elements.
                listGroup := listNode.(*schema.GroupNode)
-               if listGroup.NumFields() == 1 && (listGroup.Name() == "array" 
|| strings.HasSuffix(listGroup.Name(), "_tuple")) {
+               if listGroup.NumFields() == 1 && !(listGroup.Name() == "array" 
|| listGroup.Name() == (n.Name()+"_tuple")) {
                        // list of primitive type
-                       if err := groupToStructField(listGroup, currentLevels, 
ctx, out, &out.Children[0]); err != nil {
+                       if err := nodeToSchemaField(listGroup.Field(0), 
currentLevels, ctx, out, &out.Children[0]); err != nil {
                                return err
                        }
                } else {
-                       if err := nodeToSchemaField(listGroup.Field(0), 
currentLevels, ctx, out, &out.Children[0]); err != nil {
+                       if err := groupToStructField(listGroup, currentLevels, 
ctx, out, &out.Children[0]); err != nil {
                                return err
                        }
                }
@@ -680,8 +679,10 @@ func listToSchemaField(n *schema.GroupNode, currentLevels 
file.LevelInfo, ctx *s
                populateLeaf(colIndex, &itemField, currentLevels, ctx, out, 
&out.Children[0])
        }
 
-       out.Field = &arrow.Field{Name: n.Name(), Type: 
arrow.ListOf(out.Children[0].Field.Type),
+       out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOfField(
+               arrow.Field{Name: listNode.Name(), Type: 
out.Children[0].Field.Type, Nullable: true}),
                Nullable: n.RepetitionType() == parquet.Repetitions.Optional, 
Metadata: createFieldMeta(int(n.FieldID()))}
+
        out.LevelInfo = currentLevels
        // At this point current levels contains the def level for this list,
        // we need to reset to the prior parent.
diff --git a/go/parquet/pqarrow/schema_test.go 
b/go/parquet/pqarrow/schema_test.go
index e6de8ae250..07133fb400 100644
--- a/go/parquet/pqarrow/schema_test.go
+++ b/go/parquet/pqarrow/schema_test.go
@@ -309,3 +309,61 @@ func TestConvertArrowStruct(t *testing.T) {
                assert.Truef(t, 
parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", 
i, parquetSchema.Column(i).Name())
        }
 }
+
+func TestListStructBackwardCompatible(t *testing.T) {
+       // Set up old construction for list of struct, not using
+       // the 3-level encoding. Schema looks like:
+       //
+       //     required group field_id=-1 root {
+       //       optional group field_id=-1 answers (List) {
+       //                 repeated group field_id=-1 array {
+       //           optional byte_array field_id=-1 type (String);
+       //           optional byte_array field_id=-1 rdata (String);
+       //           optional byte_array field_id=-1 class (String);
+       //         }
+       //       }
+       //     }
+       //
+       // Instaed of the proper 3-level encoding which would be:
+       //
+       //     repeated group field_id=-1 schema {
+       //       optional group field_id=-1 answers (List) {
+       //         repeated group field_id=-1 list {
+       //           optional group field_id=-1 element {
+       //             optional byte_array field_id=-1 type (String);
+       //             optional byte_array field_id=-1 rdata (String);
+       //             optional byte_array field_id=-1 class (String);
+       //           }
+       //         }
+       //       }
+       //     }
+       //
+       pqSchema := 
schema.NewSchema(schema.MustGroup(schema.NewGroupNode("root", 
parquet.Repetitions.Required, schema.FieldList{
+               schema.Must(schema.NewGroupNodeLogical("answers", 
parquet.Repetitions.Optional, schema.FieldList{
+                       schema.Must(schema.NewGroupNode("array", 
parquet.Repetitions.Repeated, schema.FieldList{
+                               
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("type", 
parquet.Repetitions.Optional,
+                                       schema.StringLogicalType{}, 
parquet.Types.ByteArray, -1, -1)),
+                               
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("rdata", 
parquet.Repetitions.Optional,
+                                       schema.StringLogicalType{}, 
parquet.Types.ByteArray, -1, -1)),
+                               
schema.MustPrimitive(schema.NewPrimitiveNodeLogical("class", 
parquet.Repetitions.Optional,
+                                       schema.StringLogicalType{}, 
parquet.Types.ByteArray, -1, -1)),
+                       }, -1)),
+               }, schema.NewListLogicalType(), -1)),
+       }, -1)))
+
+       meta := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
+       // desired equivalent arrow schema would be list<item: struct<type: 
utf8, rdata: utf8, class: utf8>>
+       arrowSchema := arrow.NewSchema(
+               []arrow.Field{
+                       {Name: "answers", Type: arrow.ListOfField(arrow.Field{
+                               Name: "array", Type: arrow.StructOf(
+                                       arrow.Field{Name: "type", Type: 
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
+                                       arrow.Field{Name: "rdata", Type: 
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
+                                       arrow.Field{Name: "class", Type: 
arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
+                               ), Nullable: true}), Nullable: true, Metadata: 
meta},
+               }, nil)
+
+       arrsc, err := pqarrow.FromParquet(pqSchema, nil, 
metadata.KeyValueMetadata{})
+       assert.NoError(t, err)
+       assert.True(t, arrowSchema.Equal(arrsc))
+}

Reply via email to