xixipi-lining commented on code in PR #596: URL: https://github.com/apache/iceberg-go/pull/596#discussion_r2455380769
########## table/update_schema.go: ########## @@ -0,0 +1,944 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package table + +import ( + "errors" + "fmt" + "maps" + "slices" + "strings" + + "github.com/apache/iceberg-go" +) + +const TableRootID = -1 + +type MoveOp string + +const ( + MoveOpFirst MoveOp = "first" + MoveOpBefore MoveOp = "before" + MoveOpAfter MoveOp = "after" +) + +type move struct { + FieldID int + RelativeTo int + Op MoveOp +} + +// UpdateSchema manages schema evolution operations within a transaction. +// It supports adding, deleting, renaming, updating, and reordering columns, +// and ensures all changes are validated before being committed. +// +// Operations can be chained together and are applied in the order they are called. +// Changes are not persisted until Commit() is called. 
+// +// Basic Usage: +// +// txn := table.NewTransaction() +// updateSchema := txn.UpdateSchema(true, false) +// +// // Add a new column +// updateSchema.AddColumn([]string{"email"}, iceberg.PrimitiveTypes.String, "Email address", false, nil) +// +// // Commit changes +// if err := updateSchema.Commit(); err != nil { +// return err +// } +// if _, err := txn.Commit(ctx); err != nil { +// return err +// } +// +// Chaining Operations: +// +// updateSchema. +// AddColumn([]string{"age"}, iceberg.PrimitiveTypes.Int, "User age", false, nil). +// RenameColumn([]string{"name"}, "full_name"). +// MoveFirst([]string{"id"}). +// Commit() +// +// Adding Nested Columns: +// +// // Add a column to a struct field +// updateSchema.AddColumn([]string{"address", "country"}, iceberg.PrimitiveTypes.String, "Country code", false, iceberg.StringLiteral("US")) +// +// // Commit the schema update +// if err := updateSchema.Commit(); err != nil { +// return err +// } +// if _, err := txn.Commit(ctx); err != nil { +// return err +// } +type UpdateSchema struct { + txn *Transaction + schema *iceberg.Schema + lastColumnID int + + deletes map[int]struct{} + updates map[int]map[int]iceberg.NestedField + adds map[int][]iceberg.NestedField + moves map[int][]move + + identifierFieldNames map[string]struct{} + parentID map[int]int + + addedNameToID map[string]int + allowIncompatibleChanges bool + caseSensitive bool + nameMapping iceberg.NameMapping + ops []func() error +} + +// UpdateSchemaOption is a functional option for configuring UpdateSchema. +type UpdateSchemaOption func(*UpdateSchema) + +// WithNameMapping configures the UpdateSchema to use the provided name mapping +// for tracking field name changes and ensuring consistency during schema evolution. 
+func WithNameMapping(nameMapping iceberg.NameMapping) UpdateSchemaOption { + return func(u *UpdateSchema) { + u.nameMapping = nameMapping + } +} + +// NewUpdateSchema creates a new UpdateSchema instance for managing schema changes +// within a transaction. +// +// Parameters: +// - txn: The transaction that this schema update will be applied to. +// - caseSensitive: If true, field name lookups are case-sensitive; if false, +// field names are matched case-insensitively. +// - allowIncompatibleChanges: If true, allows schema changes that would normally +// be rejected for being incompatible (e.g., adding required fields without +// default values, changing field types in non-promotable ways, or changing +// column nullability from optional to required). +// - opts: Optional configuration functions to customize the UpdateSchema behavior. +// +// Returns an UpdateSchema instance that can be used to build and apply schema changes. +func NewUpdateSchema(txn *Transaction, caseSensitive bool, allowIncompatibleChanges bool, opts ...UpdateSchemaOption) *UpdateSchema { + u := &UpdateSchema{ + txn: txn, + schema: nil, + lastColumnID: txn.meta.CurrentSchema().HighestFieldID(), + + deletes: make(map[int]struct{}), + updates: make(map[int]map[int]iceberg.NestedField), + adds: make(map[int][]iceberg.NestedField), + moves: make(map[int][]move), + + identifierFieldNames: nil, + parentID: make(map[int]int), + + addedNameToID: make(map[string]int), + allowIncompatibleChanges: allowIncompatibleChanges, + caseSensitive: caseSensitive, + nameMapping: nil, + ops: make([]func() error, 0), + } + + for _, opt := range opts { + opt(u) + } + + return u +} + +func (u *UpdateSchema) init() error { + if u.txn == nil { + return errors.New("transaction is nil") + } + if u.txn.meta == nil { + return errors.New("transaction meta is nil") + } + + u.schema = u.txn.meta.CurrentSchema() + if u.schema == nil { + return errors.New("current schema is nil") + } + + if err := u.initIdentifierFieldNames(); 
err != nil { + return err + } + + if err := u.initParentID(); err != nil { + return err + } + + return nil +} + +func (u *UpdateSchema) initIdentifierFieldNames() error { + if u.identifierFieldNames != nil { + return nil + } + + identifierFieldNames := make(map[string]struct{}) + for _, id := range u.schema.IdentifierFieldIDs { + name, ok := u.schema.FindColumnName(id) + if !ok { + return fmt.Errorf("identifier field %d not found", id) + } + identifierFieldNames[name] = struct{}{} + } + + u.identifierFieldNames = identifierFieldNames + + return nil +} + +func (u *UpdateSchema) initParentID() error { + parents, err := iceberg.IndexParents(u.schema) + if err != nil { + return err + } + + maps.Copy(u.parentID, parents) + + return nil +} + +func (u *UpdateSchema) assignNewColumnID() int { + u.lastColumnID++ + + return u.lastColumnID +} + +func (u *UpdateSchema) findField(name string) (iceberg.NestedField, bool) { + if u.caseSensitive { + return u.schema.FindFieldByName(name) + } else { + return u.schema.FindFieldByNameCaseInsensitive(name) + } +} + +func (u *UpdateSchema) isDeleted(fieldID int) bool { + _, ok := u.deletes[fieldID] + + return ok +} + +func (u *UpdateSchema) findParentID(fieldID int) int { + parentID, ok := u.parentID[fieldID] + if !ok { + return TableRootID + } + + return parentID +} + +func (u *UpdateSchema) AddColumn(path []string, fieldType iceberg.Type, doc string, required bool, defaultValue iceberg.Literal) *UpdateSchema { + u.ops = append(u.ops, func() error { + return u.addColumn(path, fieldType, doc, required, defaultValue) + }) + + return u +} + +func (u *UpdateSchema) addColumn(path []string, fieldType iceberg.Type, doc string, required bool, defaultValue iceberg.Literal) error { + if len(path) == 0 { + return errors.New("path is empty") + } + + fullName := strings.Join(path, ".") + + switch t := fieldType.(type) { + case *iceberg.ListType, *iceberg.MapType, *iceberg.StructType: + if defaultValue != nil { + return fmt.Errorf("default values 
are not supported for %s", t.String()) + } + case iceberg.PrimitiveType: + if required && defaultValue == nil && !u.allowIncompatibleChanges { + return fmt.Errorf("required field %s has no default value", fullName) + } + if defaultValue != nil && !defaultValue.Type().Equals(t) { + return fmt.Errorf("default value type mismatch: %s != %s", defaultValue.Type(), t) + } + default: + return fmt.Errorf("invalid field type: %T", t) + } + + parent := path[:len(path)-1] + parentID := TableRootID + + if len(parent) > 0 { + parentFullPath := strings.Join(parent, ".") + parentField, ok := u.findField(parentFullPath) + if !ok { + return fmt.Errorf("parent field not found: %s", parentFullPath) + } + + switch parentType := parentField.Type.(type) { + case *iceberg.ListType: + f := parentType.ElementField() + parentField = f + case *iceberg.MapType: + f := parentType.ValueField() + parentField = f + } + + if _, ok := parentField.Type.(*iceberg.StructType); !ok { + return fmt.Errorf("cannot add field to non-struct type: %s", parentFullPath) + } + + parentID = parentField.ID + } + + name := path[len(path)-1] + for _, add := range u.adds[parentID] { + if add.Name == name { + return fmt.Errorf("field already exists in adds: %s", fullName) + } + } + + // support add field with the same name as deleted field and renamed field + if field, ok := u.findField(fullName); ok { + if !u.isDeleted(field.ID) { + for _, upd := range u.updates[parentID] { + if upd.Name == name { + return fmt.Errorf("field already exists: %s", fullName) + } + } + } + } + + field := iceberg.NestedField{ + Name: name, + Type: fieldType, + Required: required, + Doc: doc, + } + if defaultValue != nil { + field.InitialDefault = defaultValue.Any() + field.WriteDefault = defaultValue.Any() + } + + sch, err := iceberg.AssignFreshSchemaIDs(iceberg.NewSchema(0, field), u.assignNewColumnID) + if err != nil { + return fmt.Errorf("failed to assign field id: %w", err) + } + u.adds[parentID] = append(u.adds[parentID], 
sch.Field(0)) + u.addedNameToID[fullName] = sch.Field(0).ID + + return nil +} + +func (u *UpdateSchema) DeleteColumn(path []string) *UpdateSchema { + u.ops = append(u.ops, func() error { + return u.deleteColumn(path) + }) + + return u +} + +func (u *UpdateSchema) deleteColumn(path []string) error { + fullName := strings.Join(path, ".") + field, ok := u.findField(fullName) + if !ok { + return fmt.Errorf("field not found: %s", fullName) + } + + if _, ok := u.adds[field.ID]; ok { + return fmt.Errorf("field that has additions cannot be deleted: %s", fullName) + } + + if _, ok := u.updates[field.ID]; ok { + return fmt.Errorf("field that has updates cannot be deleted: %s", fullName) + } + + delete(u.identifierFieldNames, fullName) + + u.deletes[field.ID] = struct{}{} + + return nil +} + +type ColumnUpdate struct { + Name iceberg.Optional[string] + FieldType iceberg.Optional[iceberg.Type] + Required iceberg.Optional[bool] + WriteDefault iceberg.Optional[iceberg.Literal] + Doc iceberg.Optional[string] +} + +func (u *UpdateSchema) UpdateColumn(path []string, update ColumnUpdate) *UpdateSchema { + u.ops = append(u.ops, func() error { + return u.updateColumn(path, update) + }) + + return u +} + +func (u *UpdateSchema) updateColumn(path []string, update ColumnUpdate) error { + if !update.Name.Valid && + !update.FieldType.Valid && + !update.Required.Valid && + !update.WriteDefault.Valid && + !update.Doc.Valid { + return nil + } + + fullName := strings.Join(path, ".") + + field, ok := u.findField(fullName) + if !ok { + return fmt.Errorf("field not found: %s", fullName) + } + + if u.isDeleted(field.ID) { + return fmt.Errorf("field that has been deleted cannot be updated: %s", fullName) + } + + parentID := u.findParentID(field.ID) + + if update.Name.Valid { + if update.Name.Val == "" { + return fmt.Errorf("cannot rename field to empty name: %s", fullName) + } + if field.Name == update.Name.Val { + return fmt.Errorf("cannot rename field to the same name: %s", fullName) + } + + 
newFullName := strings.Join(append(path[:len(path)-1], update.Name.Val), ".") + if existingField, ok := u.findField(newFullName); ok { + if !u.isDeleted(existingField.ID) { + return fmt.Errorf("field already exists: %s", newFullName) + } + } + + for _, add := range u.adds[parentID] { + if add.Name == update.Name.Val { + return fmt.Errorf("cannot rename field to added field: %s", newFullName) + } + } + + for _, upd := range u.updates[parentID] { + if upd.Name == update.Name.Val && upd.ID != field.ID { + return fmt.Errorf("cannot rename field to renamed field: %s", newFullName) + } + } + + if _, ok := u.identifierFieldNames[fullName]; ok { + delete(u.identifierFieldNames, fullName) + u.identifierFieldNames[newFullName] = struct{}{} + } + } + + if update.FieldType.Valid { + if _, ok := field.Type.(iceberg.PrimitiveType); !ok { + return fmt.Errorf("cannot update field type for non-primitive type: %s", fullName) + } + if !update.FieldType.Val.Equals(field.Type) && !u.allowIncompatibleChanges { + fieldType, err := iceberg.PromoteType(field.Type, update.FieldType.Val) + if err != nil { + return err + } + update.FieldType.Val = fieldType + } + } + + if update.Required.Valid { + if field.Required != update.Required.Val { + if !u.allowIncompatibleChanges && update.Required.Val { + return fmt.Errorf("cannot change column nullability from optional to required: %s", fullName) + } + } + } + + if update.WriteDefault.Valid { + if update.WriteDefault.Val == nil { + if field.Required && !u.allowIncompatibleChanges { + return fmt.Errorf("cannot change default value of required column to nil: %s", fullName) + } + } + } + + if _, ok := u.updates[parentID]; !ok { + u.updates[parentID] = make(map[int]iceberg.NestedField) + } + + updatedField, ok := u.updates[parentID][field.ID] + if !ok { + updatedField = field + } + if update.Name.Valid { + updatedField.Name = update.Name.Val + } + if update.FieldType.Valid { + updatedField.Type = update.FieldType.Val + } + if update.Required.Valid { + 
updatedField.Required = update.Required.Val + } + if update.WriteDefault.Valid { + updatedField.WriteDefault = update.WriteDefault.Val.Any() + } + if update.Doc.Valid { + updatedField.Doc = update.Doc.Val + } + u.updates[parentID][field.ID] = updatedField + + return nil +} + +func (u *UpdateSchema) RenameColumn(path []string, newName string) *UpdateSchema { + u.ops = append(u.ops, func() error { + return u.updateColumn(path, ColumnUpdate{ + Name: iceberg.Optional[string]{ + Valid: true, + Val: newName, + }, + }) + }) + + return u +} + +func (u *UpdateSchema) MoveColumn(op MoveOp, path, relativeTo []string) *UpdateSchema { + u.ops = append(u.ops, func() error { + return u.moveColumn(op, path, relativeTo) + }) + + return u +} + +func (u *UpdateSchema) MoveFirst(path []string) *UpdateSchema { + u.ops = append(u.ops, func() error { + return u.moveColumn(MoveOpFirst, path, nil) + }) + + return u +} + +func (u *UpdateSchema) MoveBefore(path, relativeTo []string) *UpdateSchema { + u.ops = append(u.ops, func() error { + return u.moveColumn(MoveOpBefore, path, relativeTo) + }) + + return u +} + +func (u *UpdateSchema) MoveAfter(path, relativeTo []string) *UpdateSchema { + u.ops = append(u.ops, func() error { + return u.moveColumn(MoveOpAfter, path, relativeTo) + }) + + return u +} + +func (u *UpdateSchema) findFieldForMove(name string) (int, bool) { + field, ok := u.findField(name) + if ok { + return field.ID, true + } + id, ok := u.addedNameToID[name] + + return id, ok +} + +func (u *UpdateSchema) moveColumn(op MoveOp, path []string, relativeTo []string) error { + fullName := strings.Join(path, ".") + fieldID, ok := u.findFieldForMove(fullName) + if !ok { + return fmt.Errorf("field not found: %s", fullName) + } + + if u.isDeleted(fieldID) { + return fmt.Errorf("field that has been deleted cannot be moved: %s", fullName) + } + + parentID := u.findParentID(fieldID) + + switch op { + case MoveOpFirst: + u.moves[parentID] = append(u.moves[parentID], move{ + FieldID: fieldID, + 
RelativeTo: -1, + Op: op, + }) + + return nil + case MoveOpBefore, MoveOpAfter: + relativeToFullName := strings.Join(relativeTo, ".") + relativeToFieldID, ok := u.findFieldForMove(relativeToFullName) + if !ok { + return fmt.Errorf("relative to field not found: %s", relativeToFullName) + } + + if relativeToFieldID == fieldID { + return fmt.Errorf("cannot move a field to itself: %s", fullName) + } + + if u.findParentID(relativeToFieldID) != parentID { + return fmt.Errorf("relative to field is not a child of the parent: %s", relativeToFullName) + } + u.moves[parentID] = append(u.moves[parentID], move{ + FieldID: fieldID, + RelativeTo: relativeToFieldID, + Op: op, + }) + + return nil + default: + + return fmt.Errorf("invalid move operation: %s", op) + } +} + +func (u *UpdateSchema) SetIdentifierField(paths [][]string) *UpdateSchema { + identifierFieldNames := make(map[string]struct{}) + for _, path := range paths { + identifierFieldNames[strings.Join(path, ".")] = struct{}{} + } + u.identifierFieldNames = identifierFieldNames + + return u +} + +func (u *UpdateSchema) BuildUpdates() ([]Update, []Requirement, error) { + newSchema, err := u.Apply() + if err != nil { + return nil, nil, err + } + + existingSchemaID := -1 + for _, schema := range u.txn.meta.schemaList { + if newSchema.Equals(schema) { + existingSchemaID = schema.ID + + break + } + } + + requirements := make([]Requirement, 0) + updates := make([]Update, 0) + + if existingSchemaID != u.schema.ID { + requirements = append(requirements, AssertCurrentSchemaID(u.schema.ID)) + if existingSchemaID == -1 { + updates = append( + updates, + NewAddSchemaUpdate(newSchema), + NewSetCurrentSchemaUpdate(newSchema.ID), + ) + } else { + updates = append(updates, + NewSetCurrentSchemaUpdate(newSchema.ID), + ) + } + + if u.nameMapping != nil { + updatesMap := make(map[int]iceberg.NestedField) + for _, upds := range u.updates { + maps.Copy(updatesMap, upds) + } + updatedNameMapping, err := 
iceberg.UpdateNameMapping(u.nameMapping, updatesMap, u.adds) + if err != nil { + return nil, nil, err + } + updates = append(updates, NewSetPropertiesUpdate(iceberg.Properties{ + DefaultNameMappingKey: updatedNameMapping.String(), + })) + } + } + + return updates, requirements, nil +} + +func (u *UpdateSchema) Apply() (*iceberg.Schema, error) { + if err := u.init(); err != nil { + return nil, err + } + + for _, op := range u.ops { + if err := op(); err != nil { + return nil, err + } + } + + updates := make(map[int]iceberg.NestedField) + for _, upds := range u.updates { + maps.Copy(updates, upds) + } + st, err := iceberg.Visit(u.schema, &applyChanges{ + adds: u.adds, + updates: updates, + deletes: u.deletes, + moves: u.moves, + }) + if err != nil { + return nil, fmt.Errorf("error applying schema changes: %w", err) + } + + identifierFieldIDs := make([]int, 0) + newSchema := iceberg.NewSchema(0, st.(*iceberg.StructType).FieldList...) + for name := range u.identifierFieldNames { + var field iceberg.NestedField + var ok bool + if u.caseSensitive { + field, ok = newSchema.FindFieldByName(name) + } else { + field, ok = newSchema.FindFieldByNameCaseInsensitive(name) + } + if !ok { + return nil, fmt.Errorf("identifier field not found: %s", name) + } + identifierFieldIDs = append(identifierFieldIDs, field.ID) + } + + nextSchemaID := 1 + if len(u.txn.meta.schemaList) > 0 { + nextSchemaID = 1 + slices.MaxFunc(u.txn.meta.schemaList, func(a, b *iceberg.Schema) int { + return a.ID - b.ID + }).ID + } + + return iceberg.NewSchemaWithIdentifiers(nextSchemaID, identifierFieldIDs, st.(*iceberg.StructType).FieldList...), nil +} + +func (u *UpdateSchema) Commit() error { + updates, requirements, err := u.BuildUpdates() + if err != nil { + return err + } + if len(updates) == 0 { + return nil + } + + return u.txn.apply(updates, requirements) +} Review Comment: I see your point, though I'd probably lean towards keeping `Commit()` for consistency with `UpdateSpec.Commit()`. 
Both follow the same pattern of committing changes to the transaction rather than the catalog, so it creates a two-level commit model:

- `UpdateSchema.Commit()` → to transaction
- `Transaction.Commit()` → to catalog

Also, `Apply()` is already used internally to return the new schema object (called in `BuildUpdates()`). But I'm open to other suggestions if you feel strongly about it!

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at: [email protected]

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
