wgtmac commented on code in PR #460: URL: https://github.com/apache/iceberg-cpp/pull/460#discussion_r2654932746
########## src/iceberg/update/update_schema.h: ########## @@ -0,0 +1,402 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/update/update_schema.h +/// API for schema evolution. + +#include <memory> +#include <optional> +#include <span> +#include <string> +#include <string_view> +#include <unordered_set> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" +#include "iceberg/update/pending_update.h" + +namespace iceberg { + +/// \brief API for schema evolution. +/// +/// When committing, these changes will be applied to the current table metadata. +/// Commit conflicts will not be resolved and will result in a CommitFailed error. +/// +/// TODO(Guotao Yu): Add support for V3 default values when adding columns. Currently, all +/// added columns use null as the default value, but Iceberg V3 supports custom +/// default values for new columns. +class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { + public: + static Result<std::shared_ptr<UpdateSchema>> Make( + std::shared_ptr<Transaction> transaction); + + ~UpdateSchema() override; + + /// \brief Allow incompatible changes to the schema. + /// + /// Incompatible changes can cause failures when attempting to read older data files. + /// For example, adding a required column and attempting to read data files without + /// that column will cause a failure. However, if there are no data files that are + /// not compatible with the change, it can be allowed. + /// + /// This option allows incompatible changes to be made to a schema. This should be + /// used when the caller has validated that the change will not break. For example, + /// if a column is added as optional but always populated and data older than the + /// column addition has been deleted from the table, this can be used with + /// RequireColumn() to mark the column required. + /// + /// \return Reference to this for method chaining. + UpdateSchema& AllowIncompatibleChanges(); + + /// \brief Add a new optional top-level column with documentation. + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use AddColumn(parent, + /// name, type, doc). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// The added column will be optional with a null default value. + /// + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if name contains ".". + UpdateSchema& AddColumn(std::string_view name, std::shared_ptr<Type> type, + std::string_view doc = ""); + + /// \brief Add a new optional column to a nested struct with documentation. + /// + /// The parent name is used to find the parent using Schema::FindFieldByName(). If + /// the parent name is null or empty, the new column will be added to the root as a + /// top-level column. If parent identifies a struct, a new column is added to that + /// struct. If it identifies a list, the column is added to the list element struct, + /// and if it identifies a map, the new column is added to the map's value struct. + /// + /// The given name is used to name the new column and names containing "." are not + /// handled differently. + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// The added column will be optional with a null default value. + /// + /// \param parent Name of the parent struct to which the column will be added. + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if parent doesn't identify a struct. + UpdateSchema& AddColumn(std::optional<std::string_view> parent, std::string_view name, + std::shared_ptr<Type> type, std::string_view doc = ""); + + /// \brief Add a new required top-level column. + /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use + /// AddRequiredColumn(parent, name, type). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if name contains ".". + UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> type); + + /// \brief Add a new required top-level column with documentation. + /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use + /// AddRequiredColumn(parent, name, type, doc). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if name contains ".". + UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> type, + std::string_view doc); + + /// \brief Add a new required column to a nested struct. + /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// The parent name is used to find the parent using Schema::FindFieldByName(). If + /// the parent name is null or empty, the new column will be added to the root as a + /// top-level column. If parent identifies a struct, a new column is added to that + /// struct. If it identifies a list, the column is added to the list element struct, + /// and if it identifies a map, the new column is added to the map's value struct. + /// + /// The given name is used to name the new column and names containing "." are not + /// handled differently. + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// \param parent Name of the parent struct to which the column will be added. + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if parent doesn't identify a struct. + UpdateSchema& AddRequiredColumn(std::optional<std::string_view> parent, + std::string_view name, std::shared_ptr<Type> type); + + /// \brief Add a new required column to a nested struct with documentation. + /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// The parent name is used to find the parent using Schema::FindFieldByName(). If + /// the parent name is null or empty, the new column will be added to the root as a + /// top-level column. If parent identifies a struct, a new column is added to that + /// struct. If it identifies a list, the column is added to the list element struct, + /// and if it identifies a map, the new column is added to the map's value struct. + /// + /// The given name is used to name the new column and names containing "." are not + /// handled differently. + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// \param parent Name of the parent struct to which the column will be added. + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if parent doesn't identify a struct. + UpdateSchema& AddRequiredColumn(std::optional<std::string_view> parent, + std::string_view name, std::shared_ptr<Type> type, + std::string_view doc); Review Comment: ```suggestion /// \brief Add a new required column to a nested struct with documentation. /// /// Adding a required column without a default is an incompatible change that can /// break reading older data. To suppress exceptions thrown when an incompatible /// change is detected, call AllowIncompatibleChanges(). /// /// The parent name is used to find the parent using Schema::FindFieldByName(). If /// the parent name is null or empty, the new column will be added to the root as a /// top-level column. If parent identifies a struct, a new column is added to that /// struct. If it identifies a list, the column is added to the list element struct, /// and if it identifies a map, the new column is added to the map's value struct. /// /// The given name is used to name the new column and names containing "." are not /// handled differently. /// /// If type is a nested type, its field IDs are reassigned when added to the /// existing schema. /// /// \param parent Name of the parent struct to which the column will be added. /// \param name Name for the new column. /// \param type Type for the new column. /// \param doc Documentation string for the new column. /// \return Reference to this for method chaining. /// \note InvalidArgument will be reported if parent doesn't identify a struct. UpdateSchema& AddRequiredColumn(std::optional<std::string_view> parent, std::string_view name, std::shared_ptr<Type> type, std::string_view doc = ""); ``` ########## src/iceberg/update/update_schema.cc: ########## @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/update_schema.h" + +#include <memory> +#include <optional> +#include <ranges> +#include <string> +#include <string_view> +#include <unordered_set> +#include <utility> + +#include "iceberg/schema.h" +#include "iceberg/table_metadata.h" +#include "iceberg/transaction.h" +#include "iceberg/type.h" +#include "iceberg/util/error_collector.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +Result<std::shared_ptr<UpdateSchema>> UpdateSchema::Make( + std::shared_ptr<Transaction> transaction) { + ICEBERG_PRECHECK(transaction != nullptr, + "Cannot create UpdateSchema without transaction"); + return std::shared_ptr<UpdateSchema>(new UpdateSchema(std::move(transaction))); +} + +UpdateSchema::UpdateSchema(std::shared_ptr<Transaction> transaction) + : PendingUpdate(std::move(transaction)) { + const TableMetadata& base_metadata = transaction_->current(); + + // Get the current schema + auto schema_result = base_metadata.Schema(); + if (!schema_result.has_value()) { + AddError(schema_result.error()); + return; + } + schema_ = std::move(schema_result.value()); + + // Initialize last_column_id from base metadata + last_column_id_ = base_metadata.last_column_id; + + // Initialize identifier field names from the current schema + auto identifier_names_result = schema_->IdentifierFieldNames(); + if (!identifier_names_result.has_value()) { + AddError(identifier_names_result.error()); + return; + } + identifier_field_names_ = identifier_names_result.value() | + std::ranges::to<std::unordered_set<std::string>>(); +} + +UpdateSchema::~UpdateSchema() = default; + +UpdateSchema& UpdateSchema::AllowIncompatibleChanges() { + allow_incompatible_changes_ = true; + return *this; +} + +UpdateSchema& UpdateSchema::CaseSensitive(bool case_sensitive) { + case_sensitive_ = case_sensitive; + return *this; +} + +UpdateSchema& UpdateSchema::AddColumn(std::string_view name, std::shared_ptr<Type> type, + std::string_view doc) { + // Check for "." in top-level name + ICEBERG_BUILDER_CHECK(!name.contains('.'), + "Cannot add column with ambiguous name: {}, use " + "AddColumn(parent, name, type, doc)", + name); + return AddColumnInternal(std::nullopt, name, /*is_optional=*/true, std::move(type), + doc); +} + +UpdateSchema& UpdateSchema::AddColumn(std::optional<std::string_view> parent, + std::string_view name, std::shared_ptr<Type> type, + std::string_view doc) { + return AddColumnInternal(std::move(parent), name, /*is_optional=*/true, std::move(type), + doc); +} + +UpdateSchema& UpdateSchema::AddRequiredColumn(std::string_view name, + std::shared_ptr<Type> type, + std::string_view doc) { + // Check for "." in top-level name + ICEBERG_BUILDER_CHECK(!name.contains('.'), + "Cannot add column with ambiguous name: {}, use " + "AddRequiredColumn(parent, name, type, doc)", + name); + return AddColumnInternal(std::nullopt, name, /*is_optional=*/false, std::move(type), + doc); +} + +UpdateSchema& UpdateSchema::AddRequiredColumn(std::optional<std::string_view> parent, + std::string_view name, + std::shared_ptr<Type> type) { + return AddColumnInternal(std::move(parent), name, /*is_optional=*/false, + std::move(type), ""); +} + Review Comment: ```suggestion ``` ########## src/iceberg/update/update_schema.h: ########## @@ -0,0 +1,402 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/update/update_schema.h +/// API for schema evolution. + +#include <memory> +#include <optional> +#include <span> +#include <string> +#include <string_view> +#include <unordered_set> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" +#include "iceberg/update/pending_update.h" + +namespace iceberg { + +/// \brief API for schema evolution. +/// +/// When committing, these changes will be applied to the current table metadata. +/// Commit conflicts will not be resolved and will result in a CommitFailed error. +/// +/// TODO(Guotao Yu): Add support for V3 default values when adding columns. Currently, all +/// added columns use null as the default value, but Iceberg V3 supports custom +/// default values for new columns. +class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { + public: + static Result<std::shared_ptr<UpdateSchema>> Make( + std::shared_ptr<Transaction> transaction); + + ~UpdateSchema() override; + + /// \brief Allow incompatible changes to the schema. + /// + /// Incompatible changes can cause failures when attempting to read older data files. + /// For example, adding a required column and attempting to read data files without + /// that column will cause a failure. However, if there are no data files that are + /// not compatible with the change, it can be allowed. + /// + /// This option allows incompatible changes to be made to a schema. This should be + /// used when the caller has validated that the change will not break. For example, + /// if a column is added as optional but always populated and data older than the + /// column addition has been deleted from the table, this can be used with + /// RequireColumn() to mark the column required. + /// + /// \return Reference to this for method chaining. + UpdateSchema& AllowIncompatibleChanges(); + + /// \brief Add a new optional top-level column with documentation. + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use AddColumn(parent, + /// name, type, doc). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// The added column will be optional with a null default value. + /// + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if name contains ".". + UpdateSchema& AddColumn(std::string_view name, std::shared_ptr<Type> type, + std::string_view doc = ""); + + /// \brief Add a new optional column to a nested struct with documentation. + /// + /// The parent name is used to find the parent using Schema::FindFieldByName(). If + /// the parent name is null or empty, the new column will be added to the root as a + /// top-level column. If parent identifies a struct, a new column is added to that + /// struct. If it identifies a list, the column is added to the list element struct, + /// and if it identifies a map, the new column is added to the map's value struct. + /// + /// The given name is used to name the new column and names containing "." are not + /// handled differently. + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// The added column will be optional with a null default value. + /// + /// \param parent Name of the parent struct to which the column will be added. + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if parent doesn't identify a struct. + UpdateSchema& AddColumn(std::optional<std::string_view> parent, std::string_view name, + std::shared_ptr<Type> type, std::string_view doc = ""); + + /// \brief Add a new required top-level column. + /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use + /// AddRequiredColumn(parent, name, type). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if name contains ".". + UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> type); + + /// \brief Add a new required top-level column with documentation. + /// + /// Adding a required column without a default is an incompatible change that can + /// break reading older data. To suppress exceptions thrown when an incompatible + /// change is detected, call AllowIncompatibleChanges(). + /// + /// Because "." may be interpreted as a column path separator or may be used in + /// field names, it is not allowed in names passed to this method. To add to nested + /// structures or to add fields with names that contain ".", use + /// AddRequiredColumn(parent, name, type, doc). + /// + /// If type is a nested type, its field IDs are reassigned when added to the + /// existing schema. + /// + /// \param name Name for the new column. + /// \param type Type for the new column. + /// \param doc Documentation string for the new column. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if name contains ".". + UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> type, + std::string_view doc); Review Comment: ```suggestion /// \brief Add a new required top-level column with documentation. /// /// Adding a required column without a default is an incompatible change that can /// break reading older data. To suppress exceptions thrown when an incompatible /// change is detected, call AllowIncompatibleChanges(). /// /// Because "." may be interpreted as a column path separator or may be used in /// field names, it is not allowed in names passed to this method. To add to nested /// structures or to add fields with names that contain ".", use /// AddRequiredColumn(parent, name, type, doc). /// /// If type is a nested type, its field IDs are reassigned when added to the /// existing schema. /// /// \param name Name for the new column. /// \param type Type for the new column. /// \param doc Documentation string for the new column. /// \return Reference to this for method chaining. /// \note InvalidArgument will be reported if name contains ".". UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> type, std::string_view doc = ""); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
