gty404 commented on code in PR #460: URL: https://github.com/apache/iceberg-cpp/pull/460#discussion_r2654700737
########## src/iceberg/update/update_schema.h: ########## @@ -0,0 +1,451 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/update/update_schema.h +/// API for schema evolution. + +#include <memory> +#include <optional> +#include <string> +#include <string_view> +#include <unordered_set> +#include <vector> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" +#include "iceberg/update/pending_update.h" + +namespace iceberg { + +/// \brief API for schema evolution. +/// +/// When committing, these changes will be applied to the current table metadata. +/// Commit conflicts will not be resolved and will result in a CommitFailed error. +class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { + public: + static Result<std::shared_ptr<UpdateSchema>> Make( + std::shared_ptr<Transaction> transaction); + + ~UpdateSchema() override; + + /// \brief Allow incompatible changes to the schema. + /// + /// Incompatible changes can cause failures when attempting to read older data files. + /// For example, adding a required column and attempting to read data files without + /// that column will cause a failure. 
However, if there are no data files that are + /// not compatible with the change, it can be allowed. + /// + /// This option allows incompatible changes to be made to a schema. This should be + /// used when the caller has validated that the change will not break. For example, + /// if a column is added as optional but always populated and data older than the + /// column addition has been deleted from the table, this can be used with + /// RequireColumn() to mark the column required. + /// + /// \return Reference to this for method chaining. + UpdateSchema& AllowIncompatibleChanges(); + + /// \brief Add a new optional top-level column. + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use AddColumn(parent, + /// name, type, ...). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// The added column will be optional with a null default value. + /// + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If name contains ".". + UpdateSchema& AddColumn(std::string_view name, std::shared_ptr<Type> type); + + /// \brief Add a new optional top-level column with documentation. + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use AddColumn(parent, + /// name, type, doc). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// The added column will be optional with a null default value. + /// + /// \param name Name for the new column. 
+ /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If name contains ".". + UpdateSchema& AddColumn(std::string_view name, std::shared_ptr<Type> type, + std::string_view doc); + + /// \brief Add a new optional column to a nested struct. + /// + /// The parent name is used to find the parent using Schema::FindFieldByName(). If + /// the parent name is null or empty, the new column will be added to the root as a + /// top-level column. If parent identifies a struct, a new column is added to that + /// struct. If it identifies a list, the column is added to the list element struct, + /// and if it identifies a map, the new column is added to the map's value struct. + /// + /// The given name is used to name the new column and names containing "." are not + /// handled differently. + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// The added column will be optional with a null default value. + /// + /// \param parent Name of the parent struct to which the column will be added. + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If parent doesn't identify a struct. + UpdateSchema& AddColumn(std::optional<std::string> parent, std::string_view name, + std::shared_ptr<Type> type); + + /// \brief Add a new optional column to a nested struct with documentation. + /// + /// The parent name is used to find the parent using Schema::FindFieldByName(). If + /// the parent name is null or empty, the new column will be added to the root as a + /// top-level column. If parent identifies a struct, a new column is added to that + /// struct. 
If it identifies a list, the column is added to the list element struct, + /// and if it identifies a map, the new column is added to the map's value struct. + /// + /// The given name is used to name the new column and names containing "." are not + /// handled differently. + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// The added column will be optional with a null default value. + /// + /// \param parent Name of the parent struct to which the column will be added. + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If parent doesn't identify a struct. + UpdateSchema& AddColumn(std::optional<std::string> parent, std::string_view name, + std::shared_ptr<Type> type, std::string_view doc); + + /// \brief Add a new required top-level column. + /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use + /// AddRequiredColumn(parent, name, type). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If name contains ".". + UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> type); + + /// \brief Add a new required top-level column with documentation. 
+ /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use + /// AddRequiredColumn(parent, name, type, doc). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If name contains ".". + UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> type, + std::string_view doc); + + /// \brief Add a new required column to a nested struct. + /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// The parent name is used to find the parent using Schema::FindFieldByName(). If + /// the parent name is null or empty, the new column will be added to the root as a + /// top-level column. If parent identifies a struct, a new column is added to that + /// struct. If it identifies a list, the column is added to the list element struct, + /// and if it identifies a map, the new column is added to the map's value struct. + /// + /// The given name is used to name the new column and names containing "." are not + /// handled differently. + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. 
+ /// + /// \param parent Name of the parent struct to which the column will be added. + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If parent doesn't identify a struct. + UpdateSchema& AddRequiredColumn(std::optional<std::string> parent, + std::string_view name, std::shared_ptr<Type> type); + + /// \brief Add a new required column to a nested struct with documentation. + /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// The parent name is used to find the parent using Schema::FindFieldByName(). If + /// the parent name is null or empty, the new column will be added to the root as a + /// top-level column. If parent identifies a struct, a new column is added to that + /// struct. If it identifies a list, the column is added to the list element struct, + /// and if it identifies a map, the new column is added to the map's value struct. + /// + /// The given name is used to name the new column and names containing "." are not + /// handled differently. + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// \param parent Name of the parent struct to which the column will be added. + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If parent doesn't identify a struct. + UpdateSchema& AddRequiredColumn(std::optional<std::string> parent, + std::string_view name, std::shared_ptr<Type> type, + std::string_view doc); + + /// \brief Rename a column in the schema. 
+ /// + /// The name is used to find the column to rename using Schema::FindFieldByName(). + /// + /// The new name may contain "." and such names are not parsed or handled + /// differently. + /// + /// Columns may be updated and renamed in the same schema update. + /// + /// \param name Name of the column to rename. + /// \param new_name Replacement name for the column. + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If name doesn't identify a column in the schema or if + /// this change conflicts with other additions, renames, or updates. + UpdateSchema& RenameColumn(std::string_view name, std::string_view new_name); + + /// \brief Update a column in the schema to a new primitive type. + /// + /// The name is used to find the column to update using Schema::FindFieldByName(). + /// + /// Only updates that widen types are allowed. + /// + /// Columns may be updated and renamed in the same schema update. + /// + /// \param name Name of the column to update. + /// \param new_type Replacement type for the column (must be primitive). + /// \return Reference to this for method chaining. + /// \throws InvalidArgument If name doesn't identify a column in the schema or if + /// this change introduces a type incompatibility or if it conflicts with + /// other additions, renames, or updates. + UpdateSchema& UpdateColumn(std::string_view name, + std::shared_ptr<PrimitiveType> new_type); + + /// \brief Update a column in the schema to a new primitive type with documentation. + /// + /// The name is used to find the column to update using Schema::FindFieldByName(). + /// + /// Only updates that widen types are allowed. + /// + /// Columns may be updated and renamed in the same schema update. + /// + /// \param name Name of the column to update. + /// \param new_type Replacement type for the column (must be primitive). + /// \param new_doc Replacement documentation string for the column. + /// \return Reference to this for method chaining. 
+ /// \throws InvalidArgument If name doesn't identify a column in the schema or if + /// this change introduces a type incompatibility or if it conflicts with + /// other additions, renames, or updates. + UpdateSchema& UpdateColumn(std::string_view name, Review Comment: I removed the overload that updates both the type and the doc in a single call. In the Java implementation, each update call modifies only one attribute of a column at a time. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
