[GitHub] [iceberg] zeroshade commented on a diff in pull request #8122: Go: Schema and Types

via GitHub Mon, 24 Jul 2023 08:33:20 -0700


zeroshade commented on code in PR #8122:
URL: https://github.com/apache/iceberg/pull/8122#discussion_r1272430167



##########
go/iceberg/schema.go:
##########
@@ -0,0 +1,849 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg
+
+import (
+       "encoding/json"
+       "fmt"
+       "strings"
+       "sync/atomic"
+
+       "golang.org/x/exp/maps"
+       "golang.org/x/exp/slices"
+)
+
+// Schema is an Iceberg table schema, represented as a struct with
+// multiple fields. The fields are only exported via accessor methods
+// rather than exposing the slice directly in order to ensure a schema
+// is immutable.
+type Schema struct {
+       // ID is the schema identifier, serialized as "schema-id".
+       ID                 int   `json:"schema-id"`
+       // IdentifierFieldIDs holds the IDs of the identifier fields,
+       // serialized as "identifier-field-ids".
+       IdentifierFieldIDs []int `json:"identifier-field-ids"`
+
+       // fields holds the schema's fields. It is unexported, so encoding/json
+       // does not see it directly.
+       fields []NestedField
+
+       // the following maps are lazily populated as needed.
+       // rather than have lock contention with a mutex, we can use
+       // atomic pointers to Store/Load the values.
+       idToName      atomic.Pointer[map[int]string]
+       idToField     atomic.Pointer[map[int]NestedField]
+       nameToID      atomic.Pointer[map[string]int]
+       nameToIDLower atomic.Pointer[map[string]int]
+}
+
+// NewSchema builds a schema from the given ID and fields. The resulting
+// schema carries an empty, non-nil list of identifier field IDs.
+func NewSchema(id int, fields ...NestedField) *Schema {
+       noIdentifiers := []int{}
+       return NewSchemaWithIdentifiers(id, noIdentifiers, fields...)
+}
+
+// NewSchemaWithIdentifiers constructs a new schema with the provided ID
+// and fields, along with a slice of field IDs to be listed as identifier
+// fields.
+//
+// The fields are copied (shallowly) into the schema, so a caller that
+// passes an existing slice via fields... cannot alter the schema by
+// modifying that slice afterwards. identifierIDs is stored as given,
+// without copying.
+func NewSchemaWithIdentifiers(id int, identifierIDs []int, fields ...NestedField) *Schema {
+       return &Schema{
+               ID:                 id,
+               IdentifierFieldIDs: identifierIDs,
+               // Clone rather than alias the variadic slice: the schema's
+               // fields are meant to be reachable only through accessors.
+               fields: slices.Clone(fields),
+       }
+}
+
+// lazyNameToID returns the name -> field ID index for the schema. The
+// index is built with IndexByName on first use and cached in s.nameToID;
+// a failed build is not cached and its error is returned.
+func (s *Schema) lazyNameToID() (map[string]int, error) {
+       if cached := s.nameToID.Load(); cached != nil {
+               return *cached, nil
+       }
+
+       built, err := IndexByName(s)
+       if err != nil {
+               return nil, err
+       }
+
+       // No lock is held between Load and Store, so callers racing on an
+       // empty cache may each build and store their own copy of the index.
+       s.nameToID.Store(&built)
+       return built, nil
+}
+
+// lazyIDToField returns the field ID -> field index for the schema. The
+// index is built with IndexByID on first use and cached in s.idToField;
+// a failed build is not cached and its error is returned.
+func (s *Schema) lazyIDToField() (map[int]NestedField, error) {
+       if cached := s.idToField.Load(); cached != nil {
+               return *cached, nil
+       }
+
+       built, err := IndexByID(s)
+       if err != nil {
+               return nil, err
+       }
+
+       // No lock is held between Load and Store, so callers racing on an
+       // empty cache may each build and store their own copy of the index.
+       s.idToField.Store(&built)
+       return built, nil
+}
+
+// lazyIDToName returns the field ID -> name index for the schema. The
+// index is built with IndexNameByID on first use and cached in s.idToName;
+// a failed build is not cached and its error is returned.
+func (s *Schema) lazyIDToName() (map[int]string, error) {
+       if cached := s.idToName.Load(); cached != nil {
+               return *cached, nil
+       }
+
+       built, err := IndexNameByID(s)
+       if err != nil {
+               return nil, err
+       }
+
+       // No lock is held between Load and Store, so callers racing on an
+       // empty cache may each build and store their own copy of the index.
+       s.idToName.Store(&built)
+       return built, nil
+}
+
+// lazyNameToIDLower returns the lower-cased name -> field ID index used for
+// case insensitive lookups. It is derived from lazyNameToID on first use
+// and cached in s.nameToIDLower; if the underlying index cannot be built
+// the error is returned and nothing is cached.
+//
+// When several names differ only by case they share one lower-cased key;
+// the smallest field ID among them is the one kept.
+func (s *Schema) lazyNameToIDLower() (map[string]int, error) {
+       if cached := s.nameToIDLower.Load(); cached != nil {
+               return *cached, nil
+       }
+
+       byName, err := s.lazyNameToID()
+       if err != nil {
+               return nil, err
+       }
+
+       // The result can never hold more entries than the source index.
+       lowered := make(map[string]int, len(byName))
+       for name, id := range byName {
+               key := strings.ToLower(name)
+               // Map iteration order is random, so a plain overwrite would
+               // make the winner of a case collision differ from run to run.
+               // Keeping the smallest ID makes the index deterministic.
+               if prev, seen := lowered[key]; seen && prev <= id {
+                       continue
+               }
+               lowered[key] = id
+       }
+
+       s.nameToIDLower.Store(&lowered)
+       return lowered, nil
+}
+
+// Type reports the type name of a schema, which is always "struct".
+func (s *Schema) Type() string {
+       return "struct"
+}
+
+// AsStruct wraps the schema's fields in a StructType so that the schema
+// can be used as a Type. The returned value refers to the schema's own
+// field slice; no copy is made.
+func (s *Schema) AsStruct() StructType {
+       return StructType{Fields: s.fields}
+}
+
+// NumFields returns how many fields the schema has.
+func (s *Schema) NumFields() int {
+       return len(s.fields)
+}
+
+// Field returns the field at index i. Like any slice index expression,
+// it panics when i is out of range.
+func (s *Schema) Field(i int) NestedField {
+       return s.fields[i]
+}
+
+// Children returns a copy of the schema's field slice, so changes made
+// to the returned slice do not affect the schema.
+func (s *Schema) Children() []NestedField {
+       return slices.Clone(s.fields)
+}
+
+// UnmarshalJSON decodes a schema from its JSON object form. The exported
+// struct fields are filled through their json tags and the unexported
+// field list is taken from the "fields" key. After a successful decode
+// IdentifierFieldIDs is never nil, and any previously cached lookup
+// indexes are discarded so they are rebuilt from the new fields.
+//
+// Errors from json.Unmarshal are returned unchanged.
+func (s *Schema) UnmarshalJSON(b []byte) error {
+       // Alias has Schema's fields but not its methods, so decoding into it
+       // does not call back into this UnmarshalJSON.
+       type Alias Schema
+       aux := struct {
+               Fields []NestedField `json:"fields"`
+               *Alias
+       }{Alias: (*Alias)(s)}
+
+       if err := json.Unmarshal(b, &aux); err != nil {
+               return err
+       }
+
+       s.fields = aux.Fields
+       if s.IdentifierFieldIDs == nil {
+               s.IdentifierFieldIDs = []int{}
+       }
+
+       // The lazily built indexes describe the old field list. Clear them
+       // so lookups on a reused Schema do not return stale results.
+       s.idToName.Store(nil)
+       s.idToField.Store(nil)
+       s.nameToID.Store(nil)
+       s.nameToIDLower.Store(nil)
+       return nil
+}
+
+func (s *Schema) MarshalJSON() ([]byte, error) {
+       if s.IdentifierFieldIDs == nil {
+               s.IdentifierFieldIDs = []int{}
+       }
+
+       type Alias Schema
+       return json.Marshal(struct {
+               Type   string        `json:"type"`
+               Fields []NestedField `json:"fields"`
+               *Alias
+       }{Type: "struct", Fields: s.fields, Alias: (*Alias)(s)})
+}
+
+// FindColumnName looks up the column name for the given field id. The
+// boolean result is false when the id is not present in the schema.
+func (s *Schema) FindColumnName(fieldID int) (string, bool) {
+       // The error is deliberately dropped: on failure the index is nil, and
+       // a lookup in a nil map simply reports "not found".
+       names, _ := s.lazyIDToName()
+       if name, found := names[fieldID]; found {
+               return name, true
+       }
+       return "", false
+}
+
+// FindFieldByName looks up a field by its name. The boolean result is
+// false when the schema has no field with that name.
+//
+// Note: names are matched case sensitively. For a case insensitive
+// lookup, use [*Schema.FindFieldByNameCaseInsensitive].
+func (s *Schema) FindFieldByName(name string) (NestedField, bool) {
+       // The error is deliberately dropped: on failure the index is nil, and
+       // a lookup in a nil map simply reports "not found".
+       ids, _ := s.lazyNameToID()
+       if id, found := ids[name]; found {
+               return s.FindFieldByID(id)
+       }
+       return NestedField{}, false
+}
+
+// FindFieldByNameCaseInsensitive behaves like [*Schema.FindFieldByName],
+// except that both the given name and the schema's names are lower-cased
+// before they are compared.
+func (s *Schema) FindFieldByNameCaseInsensitive(name string) (NestedField, bool) {
+       // The error is deliberately dropped: on failure the index is nil, and
+       // a lookup in a nil map simply reports "not found".
+       ids, _ := s.lazyNameToIDLower()
+       if id, found := ids[strings.ToLower(name)]; found {
+               return s.FindFieldByID(id)
+       }
+       return NestedField{}, false
+}
+
+// FindFieldByID is like [*Schema.FindColumnName], but returns the whole
+// field rather than just its name. The boolean result is false when the
+// id is not present in the schema.
+func (s *Schema) FindFieldByID(id int) (NestedField, bool) {
+       // The error is deliberately dropped: on failure the index is nil, and
+       // a lookup in a nil map simply reports "not found".
+       fields, _ := s.lazyIDToField()
+       if field, found := fields[id]; found {
+               return field, true
+       }
+       return NestedField{}, false
+}
+
+// FindTypeByID resolves a field id with [*Schema.FindFieldByID] and
+// returns only that field's data type. It returns (nil, false) when the
+// id is not found.
+func (s *Schema) FindTypeByID(id int) (Type, bool) {
+       if field, found := s.FindFieldByID(id); found {
+               return field.Type, true
+       }
+       return nil, false
+}
+
+// FindTypeByName resolves a field name with [*Schema.FindFieldByName]
+// and returns only that field's data type. It returns (nil, false) when
+// the name is not found.
+func (s *Schema) FindTypeByName(name string) (Type, bool) {
+       if field, found := s.FindFieldByName(name); found {
+               return field.Type, true
+       }
+       return nil, false
+}
+
+// FindTypeByNameCaseInsensitive is the case insensitive counterpart of
+// [*Schema.FindTypeByName]: it resolves the name with
+// [*Schema.FindFieldByNameCaseInsensitive] and returns only the field's
+// data type, or (nil, false) when the name is not found.
+func (s *Schema) FindTypeByNameCaseInsensitive(name string) (Type, bool) {
+       if field, found := s.FindFieldByNameCaseInsensitive(name); found {
+               return field.Type, true
+       }
+       return nil, false
+}
+
+// Equals compares the fields and identifierIDs, but does not compare
+// the schema ID itself.
+func (s *Schema) Equals(other *Schema) bool {
+       if other == nil {
+               return false
+       }
+
+       if s == other {
+               return true
+       }
+
+       if len(s.fields) != len(other.fields) {
+               return false
+       }
+
+       if !slices.Equal(s.IdentifierFieldIDs, other.IdentifierFieldIDs) {
+               return false
+       }
+
+       return slices.EqualFunc(s.fields, other.fields, func(a, b NestedField) 
bool {

Review Comment:
   `slices.EqualFunc` will break its loop when it encounters the first false.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] zeroshade commented on a diff in pull request #8122: Go: Schema and Types

Reply via email to