zeroshade commented on code in PR #395:
URL: https://github.com/apache/arrow-go/pull/395#discussion_r2121923457


##########
arrow/extensions/variant.go:
##########
@@ -0,0 +1,1536 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package extensions
+
+import (
+       "fmt"
+       "math"
+       "reflect"
+       "strings"
+       "sync"
+
+       "github.com/apache/arrow-go/v18/arrow"
+       "github.com/apache/arrow-go/v18/arrow/array"
+       "github.com/apache/arrow-go/v18/arrow/decimal"
+       "github.com/apache/arrow-go/v18/arrow/decimal128"
+       "github.com/apache/arrow-go/v18/arrow/internal/debug"
+       "github.com/apache/arrow-go/v18/arrow/memory"
+       "github.com/apache/arrow-go/v18/internal/json"
+       "github.com/apache/arrow-go/v18/parquet/schema"
+       "github.com/apache/arrow-go/v18/parquet/variant"
+       "github.com/google/uuid"
+)
+
+// VariantType is the arrow extension type for representing Variant values as
+// defined by the Parquet Variant specification for encoding and shredding 
values.
+// The underlying storage must be a struct type with a minimum of two fields
+// ("metadata" and "value") and an optional third field ("typed_value").
+//
+// See the documentation for [NewVariantType] for the rules for creating a 
variant
+// type.
+type VariantType struct {
+       arrow.ExtensionBase
+
+       metadataFieldIdx   int
+       valueFieldIdx      int
+       typedValueFieldIdx int
+}
+
+// NewDefaultVariantType creates a basic, non-shredded variant type. The 
underlying
+// storage type will be struct<metadata: binary required, value: binary 
required>.
+func NewDefaultVariantType() *VariantType {
+       s := arrow.StructOf(
+               arrow.Field{Name: "metadata", Type: arrow.BinaryTypes.Binary, 
Nullable: false},
+               arrow.Field{Name: "value", Type: arrow.BinaryTypes.Binary, 
Nullable: false})
+
+       vt, _ := NewVariantType(s)
+       return vt
+}
+
+// NewVariantType creates a new variant type based on the provided storage 
type.
+//
+// The rules for a variant storage type are:
+//  1. MUST be a struct
+//  2. MUST have required field named "metadata" that is 
binary/largebinary/binary_view
+//  3. MUST satisfy exactly one of the following:
+//     a. MUST have required field named "value" that is 
binary/largebinary/binary_view
+//     b. MUST have an optional field named "value" that is 
binary/largebinary/binary_view
+//     and another optional field named "typed_value" that is either a 
primitive type or
+//     a list/large_list/list_view or struct which also satisfies the 
following requirements:
+//     i. The elements must be REQUIRED
+//     ii. There must either be a single REQUIRED field named "value" which is
+//     binary/largebinary/binary_view or have an optional "value" field and 
an optional
+//     "typed_value" field that follows the rules laid out in (b).
+//
+// The metadata field may also be dictionary encoded.
+func NewVariantType(storage arrow.DataType) (*VariantType, error) {
+       s, ok := storage.(*arrow.StructType)
+       if !ok {
+               return nil, fmt.Errorf("%w: bad storage type %s for variant 
type", arrow.ErrInvalid, storage)
+       }
+
+       var (
+               metadataFieldIdx   = -1
+               valueFieldIdx      = -1
+               typedValueFieldIdx = -1
+       )
+
+       if metadataFieldIdx, ok = s.FieldIdx("metadata"); !ok {
+               return nil, fmt.Errorf("%w: missing required field 'metadata' 
in variant storage type %s", arrow.ErrInvalid, storage)
+       }
+
+       if valueFieldIdx, ok = s.FieldIdx("value"); !ok {
+               return nil, fmt.Errorf("%w: missing required field 'value' in 
variant storage type %s", arrow.ErrInvalid, storage)
+       }
+
+       if s.NumFields() > 3 {
+               return nil, fmt.Errorf("%w: too many fields in variant storage 
type %s, expected 2 or 3", arrow.ErrInvalid, storage)
+       }
+
+       if s.NumFields() == 3 {
+               if typedValueFieldIdx, ok = s.FieldIdx("typed_value"); !ok {
+                       return nil, fmt.Errorf("%w: has 3 fields, but missing 
'typed_value' field, %s", arrow.ErrInvalid, storage)
+               }
+       }
+
+       mdField, valField := s.Field(metadataFieldIdx), s.Field(valueFieldIdx)
+       if mdField.Nullable {
+               return nil, fmt.Errorf("%w: metadata field must be non-nullable 
binary type, got %s", arrow.ErrInvalid, mdField.Type)
+       }
+
+       if !isBinary(mdField.Type) {
+               if mdField.Type.ID() != arrow.DICTIONARY || 
!isBinary(mdField.Type.(*arrow.DictionaryType).ValueType) {
+                       return nil, fmt.Errorf("%w: metadata field must be 
non-nullable binary type, got %s", arrow.ErrInvalid, mdField.Type)
+               }
+       }
+
+       if !isBinary(valField.Type) || (valField.Nullable && typedValueFieldIdx 
== -1) {
+               return nil, fmt.Errorf("%w: value field must be non-nullable 
binary type, got %s", arrow.ErrInvalid, valField.Type)
+       }
+
+       if typedValueFieldIdx == -1 {
+               return &VariantType{
+                       ExtensionBase:      arrow.ExtensionBase{Storage: 
storage},
+                       metadataFieldIdx:   metadataFieldIdx,
+                       valueFieldIdx:      valueFieldIdx,
+                       typedValueFieldIdx: -1,
+               }, nil
+       }
+
+       valueField := s.Field(valueFieldIdx)
+       if !valueField.Nullable {
+               return nil, fmt.Errorf("%w: value field must be nullable if 
typed_value is present, got %s", arrow.ErrInvalid, valueField.Type)
+       }
+
+       typedValueField := s.Field(typedValueFieldIdx)
+       if !typedValueField.Nullable {
+               return nil, fmt.Errorf("%w: typed_value field must be nullable, 
got %s", arrow.ErrInvalid, typedValueField.Type)
+       }
+
+       if nt, ok := typedValueField.Type.(arrow.NestedType); ok {
+               if !validNestedType(nt) {
+                       return nil, fmt.Errorf("%w: typed_value field must be a 
valid nested type, got %s", arrow.ErrInvalid, typedValueField.Type)
+               }
+       }
+
+       return &VariantType{
+               ExtensionBase:      arrow.ExtensionBase{Storage: storage},
+               metadataFieldIdx:   metadataFieldIdx,
+               valueFieldIdx:      valueFieldIdx,
+               typedValueFieldIdx: typedValueFieldIdx,
+       }, nil
+}
+
+func (*VariantType) ArrayType() reflect.Type {
+       return reflect.TypeOf(VariantArray{})
+}
+
+func (v *VariantType) Metadata() arrow.Field {
+       return v.StorageType().(*arrow.StructType).Field(v.metadataFieldIdx)
+}
+
+func (v *VariantType) Value() arrow.Field {
+       return v.StorageType().(*arrow.StructType).Field(v.valueFieldIdx)
+}
+
+func (*VariantType) ExtensionName() string { return "parquet.variant" }
+
+func (v *VariantType) String() string {
+       return fmt.Sprintf("extension<%s>", v.ExtensionName())
+}
+
+func (v *VariantType) ExtensionEquals(other arrow.ExtensionType) bool {
+       return v.ExtensionName() == other.ExtensionName() &&
+               arrow.TypeEqual(v.Storage, other.StorageType())
+}
+
+func (*VariantType) Serialize() string { return "" }
+func (*VariantType) Deserialize(storageType arrow.DataType, _ string) 
(arrow.ExtensionType, error) {
+       return NewVariantType(storageType)
+}
+
+func (*VariantType) ParquetLogicalType() schema.LogicalType {
+       return schema.VariantLogicalType{}
+}
+
+func (v *VariantType) NewBuilder(mem memory.Allocator) array.Builder {
+       return NewVariantBuilder(mem, v)
+}
+
+func isBinary(dt arrow.DataType) bool {
+       return dt.ID() == arrow.BINARY || dt.ID() == arrow.LARGE_BINARY ||
+               dt.ID() == arrow.BINARY_VIEW
+}

Review Comment:
   Hmm, `IsBinaryLike` returns true for both String and Binary types, whereas 
here we want to accept only binary types, so I don't think updating 
`IsBinaryLike` would work here.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to