Re: [PR] feat(table): add schema evolution support [iceberg-go]

via GitHub Thu, 23 Oct 2025 07:39:16 -0700


xixipi-lining commented on code in PR #596:
URL: https://github.com/apache/iceberg-go/pull/596#discussion_r2455380769



##########
table/update_schema.go:
##########
@@ -0,0 +1,944 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package table
+
+import (
+       "errors"
+       "fmt"
+       "maps"
+       "slices"
+       "strings"
+
+       "github.com/apache/iceberg-go"
+)
+
+// TableRootID is the sentinel parent ID used for columns that have no
+// recorded parent field, i.e. columns addressed with a single-element path.
+const TableRootID = -1
+
+// MoveOp names the kind of reordering requested for a column.
+type MoveOp string
+
+const (
+       // MoveOpFirst requests a move that needs no reference column.
+       MoveOpFirst  MoveOp = "first"
+       // MoveOpBefore requests a move relative to (before) a reference column.
+       MoveOpBefore MoveOp = "before"
+       // MoveOpAfter requests a move relative to (after) a reference column.
+       MoveOpAfter  MoveOp = "after"
+)
+
+// move is one recorded reordering request.
+type move struct {
+       // FieldID is the ID of the column being moved.
+       FieldID    int
+       // RelativeTo is the ID of the reference column; it is -1 for MoveOpFirst.
+       RelativeTo int
+       // Op is the requested kind of move.
+       Op         MoveOp
+}
+
+// UpdateSchema manages schema evolution operations within a transaction.
+// It supports adding, deleting, renaming, updating, and reordering columns,
+// and ensures all changes are validated before being committed.
+//
+// Operations can be chained together and are applied in the order they are
+// called. Changes are not persisted until Commit() is called.
+//
+// Operations are queued and replayed every time Apply() runs (Commit() and
+// BuildUpdates() both call it), so an UpdateSchema should be applied once.
+//
+// Basic Usage:
+//
+//     txn := table.NewTransaction()
+//     updateSchema := txn.UpdateSchema(true, false)
+//
+//     // Add a new column
+//     updateSchema.AddColumn([]string{"email"}, iceberg.PrimitiveTypes.String, "Email address", false, nil)
+//
+//     // Commit changes
+//     if err := updateSchema.Commit(); err != nil {
+//         return err
+//     }
+//     if _, err := txn.Commit(ctx); err != nil {
+//         return err
+//     }
+//
+// Chaining Operations:
+//
+//     updateSchema.
+//         AddColumn([]string{"age"}, iceberg.PrimitiveTypes.Int, "User age", false, nil).
+//         RenameColumn([]string{"name"}, "full_name").
+//         MoveFirst([]string{"id"}).
+//         Commit()
+//
+// Adding Nested Columns:
+//
+//     // Add a column to a struct field
+//     updateSchema.AddColumn([]string{"address", "country"}, iceberg.PrimitiveTypes.String, "Country code", false, iceberg.StringLiteral("US"))
+//
+//     // Commit the schema update
+//     if err := updateSchema.Commit(); err != nil {
+//         return err
+//     }
+//     if _, err := txn.Commit(ctx); err != nil {
+//         return err
+//     }
+type UpdateSchema struct {
+       // txn is the transaction the resulting updates are applied to.
+       txn          *Transaction
+       // schema is the transaction's current schema; it is resolved by init().
+       schema       *iceberg.Schema
+       // lastColumnID is the most recently assigned column ID.
+       lastColumnID int
+
+       // deletes is the set of field IDs marked for deletion.
+       deletes map[int]struct{}
+       // updates maps parent field ID -> field ID -> updated field definition.
+       updates map[int]map[int]iceberg.NestedField
+       // adds maps parent field ID -> columns added under that parent.
+       adds    map[int][]iceberg.NestedField
+       // moves maps parent field ID -> reordering requests for its children.
+       moves   map[int][]move
+
+       // identifierFieldNames is the set of dotted names of identifier fields;
+       // nil means "not yet initialized from the current schema".
+       identifierFieldNames map[string]struct{}
+       // parentID maps a field ID to the ID of its parent field.
+       parentID             map[int]int
+
+       // addedNameToID maps the dotted name of each added column to its new ID.
+       addedNameToID            map[string]int
+       // allowIncompatibleChanges disables the compatibility checks performed
+       // by addColumn and updateColumn.
+       allowIncompatibleChanges bool
+       // caseSensitive selects case-sensitive or case-insensitive name lookup.
+       caseSensitive            bool
+       // nameMapping, when non-nil, is updated together with the schema.
+       nameMapping              iceberg.NameMapping
+       // ops is the queue of pending operations executed by Apply().
+       ops                      []func() error
+}
+
+// UpdateSchemaOption customizes an UpdateSchema while it is being constructed.
+type UpdateSchemaOption func(*UpdateSchema)
+
+// WithNameMapping returns an option that stores nameMapping on the
+// UpdateSchema. When a name mapping is present, BuildUpdates also emits an
+// updated name mapping property alongside the schema change.
+func WithNameMapping(nameMapping iceberg.NameMapping) UpdateSchemaOption {
+       apply := func(target *UpdateSchema) {
+               target.nameMapping = nameMapping
+       }
+
+       return apply
+}
+
+// NewUpdateSchema creates a new UpdateSchema instance for managing schema
+// changes within a transaction.
+//
+// Parameters:
+//   - txn: The transaction that this schema update will be applied to.
+//   - caseSensitive: If true, field name lookups are case-sensitive; if false,
+//     field names are matched case-insensitively.
+//   - allowIncompatibleChanges: If true, allows schema changes that would
+//     normally be rejected for being incompatible (e.g., adding required
+//     fields without default values, changing field types in non-promotable
+//     ways, or changing column nullability from optional to required).
+//   - opts: Optional configuration functions to customize the UpdateSchema
+//     behavior.
+//
+// A nil transaction, nil metadata or missing current schema does not panic
+// here; it is reported as an error when the update is applied.
+//
+// Returns an UpdateSchema instance that can be used to build and apply schema
+// changes.
+func NewUpdateSchema(txn *Transaction, caseSensitive bool, allowIncompatibleChanges bool, opts ...UpdateSchemaOption) *UpdateSchema {
+       u := &UpdateSchema{
+               txn: txn,
+
+               deletes: make(map[int]struct{}),
+               updates: make(map[int]map[int]iceberg.NestedField),
+               adds:    make(map[int][]iceberg.NestedField),
+               moves:   make(map[int][]move),
+
+               // identifierFieldNames stays nil so init() knows it still has
+               // to be loaded from the current schema.
+               parentID: make(map[int]int),
+
+               addedNameToID:            make(map[string]int),
+               allowIncompatibleChanges: allowIncompatibleChanges,
+               caseSensitive:            caseSensitive,
+       }
+
+       // Seed the column ID counter only when the metadata is reachable;
+       // init() turns the nil cases into proper errors later on.
+       if txn != nil && txn.meta != nil {
+               if current := txn.meta.CurrentSchema(); current != nil {
+                       u.lastColumnID = current.HighestFieldID()
+               }
+       }
+
+       for _, opt := range opts {
+               opt(u)
+       }
+
+       return u
+}
+
+// init validates the transaction, resolves its current schema into u.schema,
+// and then loads the identifier field names and the parent index.
+func (u *UpdateSchema) init() error {
+       switch {
+       case u.txn == nil:
+               return errors.New("transaction is nil")
+       case u.txn.meta == nil:
+               return errors.New("transaction meta is nil")
+       }
+
+       u.schema = u.txn.meta.CurrentSchema()
+       if u.schema == nil {
+               return errors.New("current schema is nil")
+       }
+
+       if err := u.initIdentifierFieldNames(); err != nil {
+               return err
+       }
+
+       return u.initParentID()
+}
+
+// initIdentifierFieldNames loads the column names of the current schema's
+// identifier fields into u.identifierFieldNames. It is a no-op when the set
+// is already non-nil (for example after SetIdentifierField).
+func (u *UpdateSchema) initIdentifierFieldNames() error {
+       if u.identifierFieldNames != nil {
+               return nil
+       }
+
+       ids := u.schema.IdentifierFieldIDs
+       names := make(map[string]struct{}, len(ids))
+       for _, fieldID := range ids {
+               colName, found := u.schema.FindColumnName(fieldID)
+               if !found {
+                       return fmt.Errorf("identifier field %d not found", fieldID)
+               }
+               names[colName] = struct{}{}
+       }
+       u.identifierFieldNames = names
+
+       return nil
+}
+
+// initParentID merges the parent index of the current schema into
+// u.parentID.
+func (u *UpdateSchema) initParentID() error {
+       index, err := iceberg.IndexParents(u.schema)
+       if err != nil {
+               return err
+       }
+
+       for child, parent := range index {
+               u.parentID[child] = parent
+       }
+
+       return nil
+}
+
+// assignNewColumnID advances the column ID counter and returns the new ID.
+func (u *UpdateSchema) assignNewColumnID() int {
+       next := u.lastColumnID + 1
+       u.lastColumnID = next
+
+       return next
+}
+
+// findField looks name up in the current schema, honoring the configured
+// case sensitivity.
+func (u *UpdateSchema) findField(name string) (iceberg.NestedField, bool) {
+       if !u.caseSensitive {
+               return u.schema.FindFieldByNameCaseInsensitive(name)
+       }
+
+       return u.schema.FindFieldByName(name)
+}
+
+// isDeleted reports whether fieldID has been marked for deletion.
+func (u *UpdateSchema) isDeleted(fieldID int) bool {
+       _, deleted := u.deletes[fieldID]
+
+       return deleted
+}
+
+// findParentID returns the recorded parent of fieldID, or TableRootID when
+// no parent is recorded for it.
+func (u *UpdateSchema) findParentID(fieldID int) int {
+       if id, known := u.parentID[fieldID]; known {
+               return id
+       }
+
+       return TableRootID
+}
+
+// AddColumn queues the addition of a column at path. The operation is
+// validated and recorded when the update is applied; it returns the receiver
+// so calls can be chained.
+func (u *UpdateSchema) AddColumn(path []string, fieldType iceberg.Type, doc string, required bool, defaultValue iceberg.Literal) *UpdateSchema {
+       op := func() error {
+               return u.addColumn(path, fieldType, doc, required, defaultValue)
+       }
+       u.ops = append(u.ops, op)
+
+       return u
+}
+
+// addColumn validates and records the addition of a new column.
+//
+// path is the column's location; its last element is the new column name and
+// the preceding elements name the parent. A list parent resolves to its
+// element field and a map parent to its value field; the resolved parent must
+// be a struct.
+//
+// Rules enforced:
+//   - list, map and struct columns cannot carry a default value;
+//   - a required primitive column needs a default value unless incompatible
+//     changes are allowed, and the default must match the column type;
+//   - the name must be free among the parent's children: it may reuse the
+//     name of a column that was deleted or renamed away in this update, but
+//     not the name of a pending addition, of an untouched existing column, or
+//     of a sibling whose pending update carries that name.
+//
+// On success the column gets fresh field IDs and is recorded in u.adds,
+// u.addedNameToID and (for nested columns) u.parentID.
+func (u *UpdateSchema) addColumn(path []string, fieldType iceberg.Type, doc string, required bool, defaultValue iceberg.Literal) error {
+       if len(path) == 0 {
+               return errors.New("path is empty")
+       }
+
+       fullName := strings.Join(path, ".")
+
+       switch t := fieldType.(type) {
+       case *iceberg.ListType, *iceberg.MapType, *iceberg.StructType:
+               if defaultValue != nil {
+                       return fmt.Errorf("default values are not supported for %s", t.String())
+               }
+       case iceberg.PrimitiveType:
+               if required && defaultValue == nil && !u.allowIncompatibleChanges {
+                       return fmt.Errorf("required field %s has no default value", fullName)
+               }
+               if defaultValue != nil && !defaultValue.Type().Equals(t) {
+                       return fmt.Errorf("default value type mismatch: %s != %s", defaultValue.Type(), t)
+               }
+       default:
+               return fmt.Errorf("invalid field type: %T", t)
+       }
+
+       parent := path[:len(path)-1]
+       parentID := TableRootID
+
+       if len(parent) > 0 {
+               parentFullPath := strings.Join(parent, ".")
+               parentField, ok := u.findField(parentFullPath)
+               if !ok {
+                       return fmt.Errorf("parent field not found: %s", parentFullPath)
+               }
+
+               // Columns are added to the struct held by a list element or
+               // map value, not to the container itself.
+               switch parentType := parentField.Type.(type) {
+               case *iceberg.ListType:
+                       parentField = parentType.ElementField()
+               case *iceberg.MapType:
+                       parentField = parentType.ValueField()
+               }
+
+               if _, ok := parentField.Type.(*iceberg.StructType); !ok {
+                       return fmt.Errorf("cannot add field to non-struct type: %s", parentFullPath)
+               }
+
+               parentID = parentField.ID
+       }
+
+       name := path[len(path)-1]
+       for _, add := range u.adds[parentID] {
+               if add.Name == name {
+                       return fmt.Errorf("field already exists in adds: %s", fullName)
+               }
+       }
+
+       // A sibling with a pending update will end up carrying upd.Name; this
+       // also covers a sibling that is being renamed to the requested name.
+       for _, upd := range u.updates[parentID] {
+               if upd.Name == name && !u.isDeleted(upd.ID) {
+                       return fmt.Errorf("field already exists: %s", fullName)
+               }
+       }
+
+       // An existing column keeps the name unless it was deleted or renamed
+       // away, so adding the same name again must be rejected.
+       if existing, ok := u.findField(fullName); ok && !u.isDeleted(existing.ID) {
+               upd, updated := u.updates[u.findParentID(existing.ID)][existing.ID]
+               if !updated || upd.Name == existing.Name {
+                       return fmt.Errorf("field already exists: %s", fullName)
+               }
+       }
+
+       field := iceberg.NestedField{
+               Name:     name,
+               Type:     fieldType,
+               Required: required,
+               Doc:      doc,
+       }
+       if defaultValue != nil {
+               field.InitialDefault = defaultValue.Any()
+               field.WriteDefault = defaultValue.Any()
+       }
+
+       sch, err := iceberg.AssignFreshSchemaIDs(iceberg.NewSchema(0, field), u.assignNewColumnID)
+       if err != nil {
+               return fmt.Errorf("failed to assign field id: %w", err)
+       }
+
+       added := sch.Field(0)
+       u.adds[parentID] = append(u.adds[parentID], added)
+       u.addedNameToID[fullName] = added.ID
+       // Remember the parent of nested additions so that a later move of this
+       // column is filed under the right parent instead of the table root.
+       if parentID != TableRootID {
+               u.parentID[added.ID] = parentID
+       }
+
+       return nil
+}
+
+// DeleteColumn queues the deletion of the column at path and returns the
+// receiver so calls can be chained.
+func (u *UpdateSchema) DeleteColumn(path []string) *UpdateSchema {
+       op := func() error {
+               return u.deleteColumn(path)
+       }
+       u.ops = append(u.ops, op)
+
+       return u
+}
+
+func (u *UpdateSchema) deleteColumn(path []string) error {
+       fullName := strings.Join(path, ".")
+       field, ok := u.findField(fullName)
+       if !ok {
+               return fmt.Errorf("field not found: %s", fullName)
+       }
+
+       if _, ok := u.adds[field.ID]; ok {
+               return fmt.Errorf("field that has additions cannot be deleted: 
%s", fullName)
+       }
+
+       if _, ok := u.updates[field.ID]; ok {
+               return fmt.Errorf("field that has updates cannot be deleted: 
%s", fullName)
+       }
+
+       delete(u.identifierFieldNames, fullName)
+
+       u.deletes[field.ID] = struct{}{}
+
+       return nil
+}
+
+// ColumnUpdate describes the changes to apply to an existing column. Each
+// member is applied only when its Valid flag is set; an update with no valid
+// member is a no-op.
+type ColumnUpdate struct {
+       // Name is the new column name.
+       Name         iceberg.Optional[string]
+       // FieldType is the new column type; only primitive columns can change type.
+       FieldType    iceberg.Optional[iceberg.Type]
+       // Required is the new nullability of the column.
+       Required     iceberg.Optional[bool]
+       // WriteDefault is the new write default of the column.
+       WriteDefault iceberg.Optional[iceberg.Literal]
+       // Doc is the new documentation string of the column.
+       Doc          iceberg.Optional[string]
+}
+
+// UpdateColumn queues the given changes for the column at path and returns
+// the receiver so calls can be chained.
+func (u *UpdateSchema) UpdateColumn(path []string, update ColumnUpdate) *UpdateSchema {
+       op := func() error {
+               return u.updateColumn(path, update)
+       }
+       u.ops = append(u.ops, op)
+
+       return u
+}
+
+func (u *UpdateSchema) updateColumn(path []string, update ColumnUpdate) error {
+       if !update.Name.Valid &&
+               !update.FieldType.Valid &&
+               !update.Required.Valid &&
+               !update.WriteDefault.Valid &&
+               !update.Doc.Valid {
+               return nil
+       }
+
+       fullName := strings.Join(path, ".")
+
+       field, ok := u.findField(fullName)
+       if !ok {
+               return fmt.Errorf("field not found: %s", fullName)
+       }
+
+       if u.isDeleted(field.ID) {
+               return fmt.Errorf("field that has been deleted cannot be 
updated: %s", fullName)
+       }
+
+       parentID := u.findParentID(field.ID)
+
+       if update.Name.Valid {
+               if update.Name.Val == "" {
+                       return fmt.Errorf("cannot rename field to empty name: 
%s", fullName)
+               }
+               if field.Name == update.Name.Val {
+                       return fmt.Errorf("cannot rename field to the same 
name: %s", fullName)
+               }
+
+               newFullName := strings.Join(append(path[:len(path)-1], 
update.Name.Val), ".")
+               if existingField, ok := u.findField(newFullName); ok {
+                       if !u.isDeleted(existingField.ID) {
+                               return fmt.Errorf("field already exists: %s", 
newFullName)
+                       }
+               }
+
+               for _, add := range u.adds[parentID] {
+                       if add.Name == update.Name.Val {
+                               return fmt.Errorf("cannot rename field to added 
field: %s", newFullName)
+                       }
+               }
+
+               for _, upd := range u.updates[parentID] {
+                       if upd.Name == update.Name.Val && upd.ID != field.ID {
+                               return fmt.Errorf("cannot rename field to 
renamed field: %s", newFullName)
+                       }
+               }
+
+               if _, ok := u.identifierFieldNames[fullName]; ok {
+                       delete(u.identifierFieldNames, fullName)
+                       u.identifierFieldNames[newFullName] = struct{}{}
+               }
+       }
+
+       if update.FieldType.Valid {
+               if _, ok := field.Type.(iceberg.PrimitiveType); !ok {
+                       return fmt.Errorf("cannot update field type for 
non-primitive type: %s", fullName)
+               }
+               if !update.FieldType.Val.Equals(field.Type) && 
!u.allowIncompatibleChanges {
+                       fieldType, err := iceberg.PromoteType(field.Type, 
update.FieldType.Val)
+                       if err != nil {
+                               return err
+                       }
+                       update.FieldType.Val = fieldType
+               }
+       }
+
+       if update.Required.Valid {
+               if field.Required != update.Required.Val {
+                       if !u.allowIncompatibleChanges && update.Required.Val {
+                               return fmt.Errorf("cannot change column 
nullability from optional to required: %s", fullName)
+                       }
+               }
+       }
+
+       if update.WriteDefault.Valid {
+               if update.WriteDefault.Val == nil {
+                       if field.Required && !u.allowIncompatibleChanges {
+                               return fmt.Errorf("cannot change default value 
of required column to nil: %s", fullName)
+                       }
+               }
+       }
+
+       if _, ok := u.updates[parentID]; !ok {
+               u.updates[parentID] = make(map[int]iceberg.NestedField)
+       }
+
+       updatedField, ok := u.updates[parentID][field.ID]
+       if !ok {
+               updatedField = field
+       }
+       if update.Name.Valid {
+               updatedField.Name = update.Name.Val
+       }
+       if update.FieldType.Valid {
+               updatedField.Type = update.FieldType.Val
+       }
+       if update.Required.Valid {
+               updatedField.Required = update.Required.Val
+       }
+       if update.WriteDefault.Valid {
+               updatedField.WriteDefault = update.WriteDefault.Val.Any()
+       }
+       if update.Doc.Valid {
+               updatedField.Doc = update.Doc.Val
+       }
+       u.updates[parentID][field.ID] = updatedField
+
+       return nil
+}
+
+// RenameColumn queues a rename of the column at path to newName. It is a
+// column update that only sets the name; it returns the receiver so calls can
+// be chained.
+func (u *UpdateSchema) RenameColumn(path []string, newName string) *UpdateSchema {
+       rename := ColumnUpdate{
+               Name: iceberg.Optional[string]{Val: newName, Valid: true},
+       }
+       u.ops = append(u.ops, func() error {
+               return u.updateColumn(path, rename)
+       })
+
+       return u
+}
+
+// MoveColumn queues a move of the column at path using operation op;
+// relativeTo names the reference column for before/after moves. It returns
+// the receiver so calls can be chained.
+func (u *UpdateSchema) MoveColumn(op MoveOp, path, relativeTo []string) *UpdateSchema {
+       pending := func() error {
+               return u.moveColumn(op, path, relativeTo)
+       }
+       u.ops = append(u.ops, pending)
+
+       return u
+}
+
+// MoveFirst queues a MoveOpFirst move of the column at path; no reference
+// column is involved.
+func (u *UpdateSchema) MoveFirst(path []string) *UpdateSchema {
+       return u.MoveColumn(MoveOpFirst, path, nil)
+}
+
+// MoveBefore queues a MoveOpBefore move of the column at path relative to
+// the column at relativeTo.
+func (u *UpdateSchema) MoveBefore(path, relativeTo []string) *UpdateSchema {
+       return u.MoveColumn(MoveOpBefore, path, relativeTo)
+}
+
+// MoveAfter queues a MoveOpAfter move of the column at path relative to the
+// column at relativeTo.
+func (u *UpdateSchema) MoveAfter(path, relativeTo []string) *UpdateSchema {
+       return u.MoveColumn(MoveOpAfter, path, relativeTo)
+}
+
+// findFieldForMove resolves name to a field ID, looking first in the current
+// schema and then among the columns added by this update.
+func (u *UpdateSchema) findFieldForMove(name string) (int, bool) {
+       if existing, ok := u.findField(name); ok {
+               return existing.ID, true
+       }
+
+       addedID, ok := u.addedNameToID[name]
+
+       return addedID, ok
+}
+
+func (u *UpdateSchema) moveColumn(op MoveOp, path []string, relativeTo 
[]string) error {
+       fullName := strings.Join(path, ".")
+       fieldID, ok := u.findFieldForMove(fullName)
+       if !ok {
+               return fmt.Errorf("field not found: %s", fullName)
+       }
+
+       if u.isDeleted(fieldID) {
+               return fmt.Errorf("field that has been deleted cannot be moved: 
%s", fullName)
+       }
+
+       parentID := u.findParentID(fieldID)
+
+       switch op {
+       case MoveOpFirst:
+               u.moves[parentID] = append(u.moves[parentID], move{
+                       FieldID:    fieldID,
+                       RelativeTo: -1,
+                       Op:         op,
+               })
+
+               return nil
+       case MoveOpBefore, MoveOpAfter:
+               relativeToFullName := strings.Join(relativeTo, ".")
+               relativeToFieldID, ok := u.findFieldForMove(relativeToFullName)
+               if !ok {
+                       return fmt.Errorf("relative to field not found: %s", 
relativeToFullName)
+               }
+
+               if relativeToFieldID == fieldID {
+                       return fmt.Errorf("cannot move a field to itself: %s", 
fullName)
+               }
+
+               if u.findParentID(relativeToFieldID) != parentID {
+                       return fmt.Errorf("relative to field is not a child of 
the parent: %s", relativeToFullName)
+               }
+               u.moves[parentID] = append(u.moves[parentID], move{
+                       FieldID:    fieldID,
+                       RelativeTo: relativeToFieldID,
+                       Op:         op,
+               })
+
+               return nil
+       default:
+
+               return fmt.Errorf("invalid move operation: %s", op)
+       }
+}
+
+// SetIdentifierField replaces the identifier field names with the dotted
+// form of each given path. Unlike the column operations it takes effect
+// immediately instead of being queued.
+func (u *UpdateSchema) SetIdentifierField(paths [][]string) *UpdateSchema {
+       names := make(map[string]struct{}, len(paths))
+       for i := range paths {
+               names[strings.Join(paths[i], ".")] = struct{}{}
+       }
+       u.identifierFieldNames = names
+
+       return u
+}
+
+// BuildUpdates applies the queued operations and translates the resulting
+// schema into table updates and commit requirements.
+//
+//   - If the result equals the current schema, both returned slices are empty.
+//   - If the result equals another schema already known to the table, that
+//     schema is made current by its existing ID.
+//   - Otherwise the new schema is added and made current.
+//
+// Whenever the schema changes, the current schema ID is asserted as a
+// requirement and, if a name mapping was configured, an updated name mapping
+// property is emitted as well.
+func (u *UpdateSchema) BuildUpdates() ([]Update, []Requirement, error) {
+       newSchema, err := u.Apply()
+       if err != nil {
+               return nil, nil, err
+       }
+
+       existingSchemaID := -1
+       for _, schema := range u.txn.meta.schemaList {
+               if newSchema.Equals(schema) {
+                       existingSchemaID = schema.ID
+
+                       break
+               }
+       }
+
+       requirements := make([]Requirement, 0)
+       updates := make([]Update, 0)
+
+       if existingSchemaID == u.schema.ID {
+               // Nothing changed relative to the current schema.
+               return updates, requirements, nil
+       }
+
+       requirements = append(requirements, AssertCurrentSchemaID(u.schema.ID))
+       if existingSchemaID == -1 {
+               updates = append(
+                       updates,
+                       NewAddSchemaUpdate(newSchema),
+                       NewSetCurrentSchemaUpdate(newSchema.ID),
+               )
+       } else {
+               // Reuse the schema that is already registered: newSchema.ID is
+               // a freshly allocated ID that the table does not know about.
+               updates = append(updates,
+                       NewSetCurrentSchemaUpdate(existingSchemaID),
+               )
+       }
+
+       if u.nameMapping != nil {
+               updatesMap := make(map[int]iceberg.NestedField)
+               for _, upds := range u.updates {
+                       maps.Copy(updatesMap, upds)
+               }
+               updatedNameMapping, err := iceberg.UpdateNameMapping(u.nameMapping, updatesMap, u.adds)
+               if err != nil {
+                       return nil, nil, err
+               }
+               updates = append(updates, NewSetPropertiesUpdate(iceberg.Properties{
+                       DefaultNameMappingKey: updatedNameMapping.String(),
+               }))
+       }
+
+       return updates, requirements, nil
+}
+
+// Apply runs the queued operations against the transaction's current schema
+// and returns the resulting schema. The returned schema gets an ID one
+// greater than the largest schema ID known to the transaction (or 1 when
+// there is none) and carries the identifier field IDs in ascending order.
+//
+// NOTE(review): the queued operations are replayed on every call and their
+// recorded state is not reset, so a second call can fail (for example with
+// "field already exists in adds"). Apply is also invoked by BuildUpdates and
+// Commit.
+func (u *UpdateSchema) Apply() (*iceberg.Schema, error) {
+       if err := u.init(); err != nil {
+               return nil, err
+       }
+
+       for _, op := range u.ops {
+               if err := op(); err != nil {
+                       return nil, err
+               }
+       }
+
+       // applyChanges wants updates keyed by field ID only, so flatten the
+       // per-parent maps.
+       updates := make(map[int]iceberg.NestedField)
+       for _, upds := range u.updates {
+               maps.Copy(updates, upds)
+       }
+       st, err := iceberg.Visit(u.schema, &applyChanges{
+               adds:    u.adds,
+               updates: updates,
+               deletes: u.deletes,
+               moves:   u.moves,
+       })
+       if err != nil {
+               return nil, fmt.Errorf("error applying schema changes: %w", err)
+       }
+
+       structType, ok := st.(*iceberg.StructType)
+       if !ok {
+               return nil, fmt.Errorf("error applying schema changes: unexpected result type %T", st)
+       }
+
+       // Resolve identifier names against the new field list; the schema ID
+       // of this lookup schema is irrelevant.
+       identifierFieldIDs := make([]int, 0, len(u.identifierFieldNames))
+       newSchema := iceberg.NewSchema(0, structType.FieldList...)
+       for name := range u.identifierFieldNames {
+               var field iceberg.NestedField
+               var found bool
+               if u.caseSensitive {
+                       field, found = newSchema.FindFieldByName(name)
+               } else {
+                       field, found = newSchema.FindFieldByNameCaseInsensitive(name)
+               }
+               if !found {
+                       return nil, fmt.Errorf("identifier field not found: %s", name)
+               }
+               identifierFieldIDs = append(identifierFieldIDs, field.ID)
+       }
+       // Map iteration order is random; sort so the result is deterministic.
+       slices.Sort(identifierFieldIDs)
+
+       nextSchemaID := 1
+       if len(u.txn.meta.schemaList) > 0 {
+               nextSchemaID = 1 + slices.MaxFunc(u.txn.meta.schemaList, func(a, b *iceberg.Schema) int {
+                       return a.ID - b.ID
+               }).ID
+       }
+
+       return iceberg.NewSchemaWithIdentifiers(nextSchemaID, identifierFieldIDs, structType.FieldList...), nil
+}
+
+// Commit builds the schema updates and applies them to the transaction. It
+// does nothing when there are no updates to apply. The changes reach the
+// catalog only when the transaction itself is committed.
+func (u *UpdateSchema) Commit() error {
+       updates, requirements, err := u.BuildUpdates()
+       switch {
+       case err != nil:
+               return err
+       case len(updates) == 0:
+               return nil
+       }
+
+       return u.txn.apply(updates, requirements)
+}

Review Comment:
   I see your point, though I'd probably lean towards keeping `Commit()` for 
consistency with `UpdateSpec.Commit()`. 
   
   Both follow the same pattern of committing changes to the transaction rather 
than the catalog, so it creates a two-level commit model:
   - `UpdateSchema.Commit()` → to transaction  
   - `Transaction.Commit()` → to catalog
   
   Also, `Apply()` is already used internally to return the new schema object 
(called in `BuildUpdates()`).
   
   But I'm open to other suggestions if you feel strongly about it!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat(table): add schema evolution support [iceberg-go]

Reply via email to