wgtmac commented on code in PR #460:
URL: https://github.com/apache/iceberg-cpp/pull/460#discussion_r2654932746


##########
src/iceberg/update/update_schema.h:
##########
@@ -0,0 +1,402 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/update/update_schema.h
+/// API for schema evolution.
+
+#include <memory>
+#include <optional>
+#include <span>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+#include "iceberg/update/pending_update.h"
+
+namespace iceberg {
+
+/// \brief API for schema evolution.
+///
+/// When committing, these changes will be applied to the current table 
metadata.
+/// Commit conflicts will not be resolved and will result in a CommitFailed 
error.
+///
+/// TODO(Guotao Yu): Add support for V3 default values when adding columns. 
Currently, all
+/// added columns use null as the default value, but Iceberg V3 supports custom
+/// default values for new columns.
+class ICEBERG_EXPORT UpdateSchema : public PendingUpdate {
+ public:
+  static Result<std::shared_ptr<UpdateSchema>> Make(
+      std::shared_ptr<Transaction> transaction);
+
+  ~UpdateSchema() override;
+
+  /// \brief Allow incompatible changes to the schema.
+  ///
+  /// Incompatible changes can cause failures when attempting to read older 
data files.
+  /// For example, adding a required column and attempting to read data files 
without
+  /// that column will cause a failure. However, if there are no data files 
that are
+  /// not compatible with the change, it can be allowed.
+  ///
+  /// This option allows incompatible changes to be made to a schema. This 
should be
+  /// used when the caller has validated that the change will not break. For 
example,
+  /// if a column is added as optional but always populated and data older 
than the
+  /// column addition has been deleted from the table, this can be used with
+  /// RequireColumn() to mark the column required.
+  ///
+  /// \return Reference to this for method chaining.
+  UpdateSchema& AllowIncompatibleChanges();
+
+  /// \brief Add a new optional top-level column with documentation.
+  ///
+  /// Because "." may be interpreted as a column path separator or may be used 
in
+  /// field names, it is not allowed in names passed to this method. To add to 
nested
+  /// structures or to add fields with names that contain ".", use 
AddColumn(parent,
+  /// name, type, doc).
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// The added column will be optional with a null default value.
+  ///
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \param doc Documentation string for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if name contains ".".
+  UpdateSchema& AddColumn(std::string_view name, std::shared_ptr<Type> type,
+                          std::string_view doc = "");
+
+  /// \brief Add a new optional column to a nested struct with documentation.
+  ///
+  /// The parent name is used to find the parent using 
Schema::FindFieldByName(). If
+  /// the parent name is null or empty, the new column will be added to the 
root as a
+  /// top-level column. If parent identifies a struct, a new column is added 
to that
+  /// struct. If it identifies a list, the column is added to the list element 
struct,
+  /// and if it identifies a map, the new column is added to the map's value 
struct.
+  ///
+  /// The given name is used to name the new column and names containing "." 
are not
+  /// handled differently.
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// The added column will be optional with a null default value.
+  ///
+  /// \param parent Name of the parent struct to which the column will be 
added.
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \param doc Documentation string for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if parent doesn't identify a 
struct.
+  UpdateSchema& AddColumn(std::optional<std::string_view> parent, 
std::string_view name,
+                          std::shared_ptr<Type> type, std::string_view doc = 
"");
+
+  /// \brief Add a new required top-level column.
+  ///
+  /// Adding a required column without a default is an incompatible change 
that can
+  /// break reading older data. To suppress exceptions thrown when an 
incompatible
+  /// change is detected, call AllowIncompatibleChanges().
+  ///
+  /// Because "." may be interpreted as a column path separator or may be used 
in
+  /// field names, it is not allowed in names passed to this method. To add to 
nested
+  /// structures or to add fields with names that contain ".", use
+  /// AddRequiredColumn(parent, name, type).
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if name contains ".".
+  UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> 
type);
+
+  /// \brief Add a new required top-level column with documentation.
+  ///
+  /// Adding a required column without a default is an incompatible change 
that can
+  /// break reading older data. To suppress exceptions thrown when an 
incompatible
+  /// change is detected, call AllowIncompatibleChanges().
+  ///
+  /// Because "." may be interpreted as a column path separator or may be used 
in
+  /// field names, it is not allowed in names passed to this method. To add to 
nested
+  /// structures or to add fields with names that contain ".", use
+  /// AddRequiredColumn(parent, name, type, doc).
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \param doc Documentation string for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if name contains ".".
+  UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> 
type,
+                                  std::string_view doc);
+
+  /// \brief Add a new required column to a nested struct.
+  ///
+  /// Adding a required column without a default is an incompatible change 
that can
+  /// break reading older data. To suppress exceptions thrown when an 
incompatible
+  /// change is detected, call AllowIncompatibleChanges().
+  ///
+  /// The parent name is used to find the parent using 
Schema::FindFieldByName(). If
+  /// the parent name is null or empty, the new column will be added to the 
root as a
+  /// top-level column. If parent identifies a struct, a new column is added 
to that
+  /// struct. If it identifies a list, the column is added to the list element 
struct,
+  /// and if it identifies a map, the new column is added to the map's value 
struct.
+  ///
+  /// The given name is used to name the new column and names containing "." 
are not
+  /// handled differently.
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// \param parent Name of the parent struct to which the column will be 
added.
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if parent doesn't identify a 
struct.
+  UpdateSchema& AddRequiredColumn(std::optional<std::string_view> parent,
+                                  std::string_view name, std::shared_ptr<Type> 
type);
+
+  /// \brief Add a new required column to a nested struct with documentation.
+  ///
+  /// Adding a required column without a default is an incompatible change 
that can
+  /// break reading older data. To suppress exceptions thrown when an 
incompatible
+  /// change is detected, call AllowIncompatibleChanges().
+  ///
+  /// The parent name is used to find the parent using 
Schema::FindFieldByName(). If
+  /// the parent name is null or empty, the new column will be added to the 
root as a
+  /// top-level column. If parent identifies a struct, a new column is added 
to that
+  /// struct. If it identifies a list, the column is added to the list element 
struct,
+  /// and if it identifies a map, the new column is added to the map's value 
struct.
+  ///
+  /// The given name is used to name the new column and names containing "." 
are not
+  /// handled differently.
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// \param parent Name of the parent struct to which the column will be 
added.
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \param doc Documentation string for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if parent doesn't identify a 
struct.
+  UpdateSchema& AddRequiredColumn(std::optional<std::string_view> parent,
+                                  std::string_view name, std::shared_ptr<Type> 
type,
+                                  std::string_view doc);

Review Comment:
   ```suggestion
     /// \brief Add a new required column to a nested struct with documentation.
     ///
     /// Adding a required column without a default is an incompatible change 
that can
     /// break reading older data. To suppress exceptions thrown when an 
incompatible
     /// change is detected, call AllowIncompatibleChanges().
     ///
     /// The parent name is used to find the parent using 
Schema::FindFieldByName(). If
     /// the parent name is null or empty, the new column will be added to the 
root as a
     /// top-level column. If parent identifies a struct, a new column is added 
to that
     /// struct. If it identifies a list, the column is added to the list 
element struct,
     /// and if it identifies a map, the new column is added to the map's value 
struct.
     ///
     /// The given name is used to name the new column and names containing "." 
are not
     /// handled differently.
     ///
     /// If type is a nested type, its field IDs are reassigned when added to 
the
     /// existing schema.
     ///
     /// \param parent Name of the parent struct to which the column will be 
added.
     /// \param name Name for the new column.
     /// \param type Type for the new column.
     /// \param doc Documentation string for the new column.
     /// \return Reference to this for method chaining.
     /// \note InvalidArgument will be reported if parent doesn't identify a 
struct.
     UpdateSchema& AddRequiredColumn(std::optional<std::string_view> parent,
                                     std::string_view name, 
std::shared_ptr<Type> type,
                                     std::string_view doc = "");
   ```



##########
src/iceberg/update/update_schema.cc:
##########
@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/update/update_schema.h"
+
+#include <memory>
+#include <optional>
+#include <ranges>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <utility>
+
+#include "iceberg/schema.h"
+#include "iceberg/table_metadata.h"
+#include "iceberg/transaction.h"
+#include "iceberg/type.h"
+#include "iceberg/util/error_collector.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+Result<std::shared_ptr<UpdateSchema>> UpdateSchema::Make(
+    std::shared_ptr<Transaction> transaction) {
+  ICEBERG_PRECHECK(transaction != nullptr,
+                   "Cannot create UpdateSchema without transaction");
+  return std::shared_ptr<UpdateSchema>(new 
UpdateSchema(std::move(transaction)));
+}
+
+UpdateSchema::UpdateSchema(std::shared_ptr<Transaction> transaction)
+    : PendingUpdate(std::move(transaction)) {
+  const TableMetadata& base_metadata = transaction_->current();
+
+  // Get the current schema
+  auto schema_result = base_metadata.Schema();
+  if (!schema_result.has_value()) {
+    AddError(schema_result.error());
+    return;
+  }
+  schema_ = std::move(schema_result.value());
+
+  // Initialize last_column_id from base metadata
+  last_column_id_ = base_metadata.last_column_id;
+
+  // Initialize identifier field names from the current schema
+  auto identifier_names_result = schema_->IdentifierFieldNames();
+  if (!identifier_names_result.has_value()) {
+    AddError(identifier_names_result.error());
+    return;
+  }
+  identifier_field_names_ = identifier_names_result.value() |
+                            std::ranges::to<std::unordered_set<std::string>>();
+}
+
+UpdateSchema::~UpdateSchema() = default;
+
+UpdateSchema& UpdateSchema::AllowIncompatibleChanges() {
+  allow_incompatible_changes_ = true;
+  return *this;
+}
+
+UpdateSchema& UpdateSchema::CaseSensitive(bool case_sensitive) {
+  case_sensitive_ = case_sensitive;
+  return *this;
+}
+
+UpdateSchema& UpdateSchema::AddColumn(std::string_view name, 
std::shared_ptr<Type> type,
+                                      std::string_view doc) {
+  // Check for "." in top-level name
+  ICEBERG_BUILDER_CHECK(!name.contains('.'),
+                        "Cannot add column with ambiguous name: {}, use "
+                        "AddColumn(parent, name, type, doc)",
+                        name);
+  return AddColumnInternal(std::nullopt, name, /*is_optional=*/true, 
std::move(type),
+                           doc);
+}
+
+UpdateSchema& UpdateSchema::AddColumn(std::optional<std::string_view> parent,
+                                      std::string_view name, 
std::shared_ptr<Type> type,
+                                      std::string_view doc) {
+  return AddColumnInternal(std::move(parent), name, /*is_optional=*/true, 
std::move(type),
+                           doc);
+}
+
+UpdateSchema& UpdateSchema::AddRequiredColumn(std::string_view name,
+                                              std::shared_ptr<Type> type,
+                                              std::string_view doc) {
+  // Check for "." in top-level name
+  ICEBERG_BUILDER_CHECK(!name.contains('.'),
+                        "Cannot add column with ambiguous name: {}, use "
+                        "AddRequiredColumn(parent, name, type, doc)",
+                        name);
+  return AddColumnInternal(std::nullopt, name, /*is_optional=*/false, 
std::move(type),
+                           doc);
+}
+
+UpdateSchema& UpdateSchema::AddRequiredColumn(std::optional<std::string_view> 
parent,
+                                              std::string_view name,
+                                              std::shared_ptr<Type> type) {
+  return AddColumnInternal(std::move(parent), name, /*is_optional=*/false,
+                           std::move(type), "");
+}
+

Review Comment:
   ```suggestion
   ```



##########
src/iceberg/update/update_schema.h:
##########
@@ -0,0 +1,402 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/update/update_schema.h
+/// API for schema evolution.
+
+#include <memory>
+#include <optional>
+#include <span>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+#include "iceberg/update/pending_update.h"
+
+namespace iceberg {
+
+/// \brief API for schema evolution.
+///
+/// When committing, these changes will be applied to the current table 
metadata.
+/// Commit conflicts will not be resolved and will result in a CommitFailed 
error.
+///
+/// TODO(Guotao Yu): Add support for V3 default values when adding columns. 
Currently, all
+/// added columns use null as the default value, but Iceberg V3 supports custom
+/// default values for new columns.
+class ICEBERG_EXPORT UpdateSchema : public PendingUpdate {
+ public:
+  static Result<std::shared_ptr<UpdateSchema>> Make(
+      std::shared_ptr<Transaction> transaction);
+
+  ~UpdateSchema() override;
+
+  /// \brief Allow incompatible changes to the schema.
+  ///
+  /// Incompatible changes can cause failures when attempting to read older 
data files.
+  /// For example, adding a required column and attempting to read data files 
without
+  /// that column will cause a failure. However, if there are no data files 
that are
+  /// not compatible with the change, it can be allowed.
+  ///
+  /// This option allows incompatible changes to be made to a schema. This 
should be
+  /// used when the caller has validated that the change will not break. For 
example,
+  /// if a column is added as optional but always populated and data older 
than the
+  /// column addition has been deleted from the table, this can be used with
+  /// RequireColumn() to mark the column required.
+  ///
+  /// \return Reference to this for method chaining.
+  UpdateSchema& AllowIncompatibleChanges();
+
+  /// \brief Add a new optional top-level column with documentation.
+  ///
+  /// Because "." may be interpreted as a column path separator or may be used 
in
+  /// field names, it is not allowed in names passed to this method. To add to 
nested
+  /// structures or to add fields with names that contain ".", use 
AddColumn(parent,
+  /// name, type, doc).
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// The added column will be optional with a null default value.
+  ///
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \param doc Documentation string for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if name contains ".".
+  UpdateSchema& AddColumn(std::string_view name, std::shared_ptr<Type> type,
+                          std::string_view doc = "");
+
+  /// \brief Add a new optional column to a nested struct with documentation.
+  ///
+  /// The parent name is used to find the parent using 
Schema::FindFieldByName(). If
+  /// the parent name is null or empty, the new column will be added to the 
root as a
+  /// top-level column. If parent identifies a struct, a new column is added 
to that
+  /// struct. If it identifies a list, the column is added to the list element 
struct,
+  /// and if it identifies a map, the new column is added to the map's value 
struct.
+  ///
+  /// The given name is used to name the new column and names containing "." 
are not
+  /// handled differently.
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// The added column will be optional with a null default value.
+  ///
+  /// \param parent Name of the parent struct to which the column will be 
added.
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \param doc Documentation string for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if parent doesn't identify a 
struct.
+  UpdateSchema& AddColumn(std::optional<std::string_view> parent, 
std::string_view name,
+                          std::shared_ptr<Type> type, std::string_view doc = 
"");
+
+  /// \brief Add a new required top-level column.
+  ///
+  /// Adding a required column without a default is an incompatible change 
that can
+  /// break reading older data. To suppress exceptions thrown when an 
incompatible
+  /// change is detected, call AllowIncompatibleChanges().
+  ///
+  /// Because "." may be interpreted as a column path separator or may be used 
in
+  /// field names, it is not allowed in names passed to this method. To add to 
nested
+  /// structures or to add fields with names that contain ".", use
+  /// AddRequiredColumn(parent, name, type).
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if name contains ".".
+  UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> 
type);
+
+  /// \brief Add a new required top-level column with documentation.
+  ///
+  /// Adding a required column without a default is an incompatible change 
that can
+  /// break reading older data. To suppress exceptions thrown when an 
incompatible
+  /// change is detected, call AllowIncompatibleChanges().
+  ///
+  /// Because "." may be interpreted as a column path separator or may be used 
in
+  /// field names, it is not allowed in names passed to this method. To add to 
nested
+  /// structures or to add fields with names that contain ".", use
+  /// AddRequiredColumn(parent, name, type, doc).
+  ///
+  /// If type is a nested type, its field IDs are reassigned when added to the
+  /// existing schema.
+  ///
+  /// \param name Name for the new column.
+  /// \param type Type for the new column.
+  /// \param doc Documentation string for the new column.
+  /// \return Reference to this for method chaining.
+  /// \note InvalidArgument will be reported if name contains ".".
+  UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr<Type> 
type,
+                                  std::string_view doc);

Review Comment:
   ```suggestion
     /// \brief Add a new required top-level column with documentation.
     ///
     /// Adding a required column without a default is an incompatible change 
that can
     /// break reading older data. To suppress exceptions thrown when an 
incompatible
     /// change is detected, call AllowIncompatibleChanges().
     ///
     /// Because "." may be interpreted as a column path separator or may be 
used in
     /// field names, it is not allowed in names passed to this method. To add 
to nested
     /// structures or to add fields with names that contain ".", use
     /// AddRequiredColumn(parent, name, type, doc).
     ///
     /// If type is a nested type, its field IDs are reassigned when added to 
the
     /// existing schema.
     ///
     /// \param name Name for the new column.
     /// \param type Type for the new column.
     /// \param doc Documentation string for the new column.
     /// \return Reference to this for method chaining.
     /// \note InvalidArgument will be reported if name contains ".".
     UpdateSchema& AddRequiredColumn(std::string_view name, 
std::shared_ptr<Type> type,
                                     std::string_view doc = "");
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to