wgtmac commented on code in PR #516:
URL: https://github.com/apache/iceberg-cpp/pull/516#discussion_r2706658749


##########
src/iceberg/CMakeLists.txt:
##########
@@ -49,6 +49,7 @@ set(ICEBERG_SOURCES
     manifest/manifest_group.cc
     manifest/manifest_list.cc
     manifest/manifest_reader.cc
+    manifest/manifest_util_internal.cc

Review Comment:
   nit: only the header file needs the `_internal` suffix
   ```suggestion
       manifest/manifest_util.cc
   ```



##########
src/iceberg/update/fast_append.h:
##########
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/update/fast_append.h
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/manifest/manifest_list.h"

Review Comment:
   ```suggestion
   ```



##########
src/iceberg/test/fast_append_test.cc:
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/update/fast_append.h"
+
+#include <format>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "iceberg/avro/avro_register.h"
+#include "iceberg/partition_spec.h"
+#include "iceberg/schema.h"
+#include "iceberg/table_metadata.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/test/test_resource.h"
+#include "iceberg/test/update_test_base.h"
+#include "iceberg/util/uuid.h"
+
+namespace iceberg {
+
+class FastAppendTest : public UpdateTestBase {
+ protected:
+  static void SetUpTestSuite() { avro::RegisterAll(); }
+
+  void SetUp() override {
+    UpdateTestBase::SetUp();
+
+    ASSERT_THAT(catalog_->DropTable(table_ident_, /*purge=*/false), IsOk());

Review Comment:
   It looks weird to drop the table first. Perhaps we should not call `UpdateTestBase::SetUp()`.



##########
src/iceberg/update/fast_append.cc:
##########
@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/update/fast_append.h"
+
+#include <iterator>
+#include <ranges>
+#include <vector>
+
+#include "iceberg/constants.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/manifest/manifest_util_internal.h"
+#include "iceberg/snapshot.h"
+#include "iceberg/table.h"
+#include "iceberg/table_metadata.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/transaction.h"
+#include "iceberg/util/error_collector.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+Result<std::unique_ptr<FastAppend>> FastAppend::Make(
+    std::string table_name, std::shared_ptr<Transaction> transaction) {
+  ICEBERG_PRECHECK(!table_name.empty(), "Table name cannot be empty");
+  ICEBERG_PRECHECK(transaction != nullptr,
+                   "Cannot create FastAppend without a transaction");
+  return std::unique_ptr<FastAppend>(
+      new FastAppend(std::move(table_name), std::move(transaction)));
+}
+
+FastAppend::FastAppend(std::string table_name, std::shared_ptr<Transaction> 
transaction)
+    : SnapshotUpdate(std::move(transaction)), 
table_name_(std::move(table_name)) {}
+
+FastAppend& FastAppend::AppendFile(std::shared_ptr<DataFile> file) {

Review Comment:
   ```suggestion
   FastAppend& FastAppend::AppendFile(const std::shared_ptr<DataFile>& file) {
   ```
   
   This saves a copy.



##########
src/iceberg/update/snapshot_update.h:
##########
@@ -175,9 +201,7 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate {
   /// \brief Clean up all uncommitted files
   void CleanAll();
 
-  Status DeleteFile(const std::string& path);
   std::string ManifestListPath();

Review Comment:
   nit: we can move it together.



##########
src/iceberg/manifest/manifest_util_internal.cc:
##########
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/manifest/manifest_util_internal.h"
+
+#include <memory>
+#include <optional>
+
+#include "iceberg/inheritable_metadata.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/manifest/manifest_reader.h"
+#include "iceberg/manifest/manifest_writer.h"
+#include "iceberg/result.h"
+#include "iceberg/schema.h"
+#include "iceberg/snapshot.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+Result<ManifestFile> CopyAppendManifest(
+    const ManifestFile& manifest, std::shared_ptr<FileIO> file_io,
+    std::shared_ptr<Schema> schema, std::shared_ptr<PartitionSpec> spec,
+    int64_t snapshot_id, const std::string& output_path, int8_t format_version,
+    SnapshotSummaryBuilder* summary_builder) {
+  ICEBERG_ASSIGN_OR_RAISE(auto reader,
+                          ManifestReader::Make(manifest, file_io, schema, 
spec));
+  ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->Entries());
+
+  // use metadata that will add the current snapshot's ID for the rewrite
+  ICEBERG_ASSIGN_OR_RAISE(auto inheritable_metadata,
+                          InheritableMetadataFactory::ForCopy(snapshot_id));
+
+  // do not produce row IDs for the copy
+  ICEBERG_ASSIGN_OR_RAISE(
+      auto writer,
+      ManifestWriter::MakeWriter(format_version, snapshot_id, output_path, 
file_io, spec,
+                                 schema, ManifestContent::kData));
+
+  // Write all entries as added entries with the new snapshot ID
+  for (auto& entry : entries) {
+    ICEBERG_PRECHECK(entry.status == ManifestStatus::kAdded,
+                     "Manifest to copy must only contain added entries");
+
+    ICEBERG_RETURN_UNEXPECTED(inheritable_metadata->Apply(entry));
+
+    if (summary_builder != nullptr && entry.data_file != nullptr) {
+      ICEBERG_RETURN_UNEXPECTED(summary_builder->AddedFile(*spec, 
*entry.data_file));
+    }
+
+    ICEBERG_RETURN_UNEXPECTED(writer->WriteAddedEntry(entry));
+  }
+
+  ICEBERG_RETURN_UNEXPECTED(writer->Close());
+  ICEBERG_ASSIGN_OR_RAISE(auto new_manifest, writer->ToManifestFile());
+
+  return new_manifest;

Review Comment:
   ```suggestion
     return writer->ToManifestFile();
   ```



##########
src/iceberg/update/fast_append.h:
##########
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/update/fast_append.h
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/manifest/manifest_list.h"
+#include "iceberg/result.h"
+#include "iceberg/snapshot.h"

Review Comment:
   ```suggestion
   ```



##########
src/iceberg/update/fast_append.cc:
##########
@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/update/fast_append.h"
+
+#include <iterator>
+#include <ranges>
+#include <vector>
+
+#include "iceberg/constants.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/manifest/manifest_util_internal.h"
+#include "iceberg/snapshot.h"
+#include "iceberg/table.h"
+#include "iceberg/table_metadata.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/transaction.h"
+#include "iceberg/util/error_collector.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+Result<std::unique_ptr<FastAppend>> FastAppend::Make(
+    std::string table_name, std::shared_ptr<Transaction> transaction) {
+  ICEBERG_PRECHECK(!table_name.empty(), "Table name cannot be empty");
+  ICEBERG_PRECHECK(transaction != nullptr,
+                   "Cannot create FastAppend without a transaction");
+  return std::unique_ptr<FastAppend>(
+      new FastAppend(std::move(table_name), std::move(transaction)));
+}
+
+FastAppend::FastAppend(std::string table_name, std::shared_ptr<Transaction> 
transaction)
+    : SnapshotUpdate(std::move(transaction)), 
table_name_(std::move(table_name)) {}
+
+FastAppend& FastAppend::AppendFile(std::shared_ptr<DataFile> file) {
+  ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null");
+  ICEBERG_BUILDER_CHECK(file->partition_spec_id.has_value(),
+                        "Data file must have partition spec ID");
+
+  int32_t spec_id = file->partition_spec_id.value();
+  ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto spec, Spec(spec_id));
+
+  auto& data_files = new_data_files_by_spec_[spec_id];
+  auto [iter, inserted] = data_files.insert(file);
+  if (inserted) {
+    has_new_files_ = true;
+    ICEBERG_BUILDER_RETURN_IF_ERROR(summary_.AddedFile(*spec, *file));
+  }
+
+  return *this;
+}
+
+FastAppend& FastAppend::AppendManifest(const ManifestFile& manifest) {
+  ICEBERG_BUILDER_CHECK(!manifest.has_existing_files(),
+                        "Cannot append manifest with existing files");
+  ICEBERG_BUILDER_CHECK(!manifest.has_deleted_files(),
+                        "Cannot append manifest with deleted files");
+  ICEBERG_BUILDER_CHECK(manifest.added_snapshot_id == kInvalidSnapshotId,
+                        "Snapshot id must be assigned during commit");
+  ICEBERG_BUILDER_CHECK(manifest.sequence_number == kInvalidSequenceNumber,
+                        "Sequence number must be assigned during commit");
+
+  if (can_inherit_snapshot_id() && manifest.added_snapshot_id == 
kInvalidSnapshotId) {
+    summary_.AddedManifest(manifest);
+    append_manifests_.push_back(manifest);
+  } else {
+    // The manifest must be rewritten with this update's snapshot ID
+    ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto copied_manifest, 
CopyManifest(manifest));
+    rewritten_append_manifests_.push_back(std::move(copied_manifest));
+  }
+
+  return *this;
+}
+
+FastAppend& FastAppend::ToBranch(const std::string& branch) {
+  ICEBERG_BUILDER_RETURN_IF_ERROR(SetTargetBranch(branch));
+  return *this;
+}
+
+std::string FastAppend::operation() { return DataOperation::kAppend; }
+
+Result<std::vector<ManifestFile>> FastAppend::Apply(
+    const TableMetadata& metadata_to_update, const std::shared_ptr<Snapshot>& 
snapshot) {
+  std::vector<ManifestFile> manifests;
+
+  ICEBERG_ASSIGN_OR_RAISE(auto new_written_manifests, WriteNewManifests());
+  manifests.reserve(new_written_manifests.size() + append_manifests_.size() +
+                    rewritten_append_manifests_.size());
+  if (!new_written_manifests.empty()) {
+    manifests.insert(manifests.end(),
+                     std::make_move_iterator(new_written_manifests.begin()),
+                     std::make_move_iterator(new_written_manifests.end()));
+  }
+
+  // Transform append manifests and rewritten append manifests with snapshot ID
+  int64_t snapshot_id = SnapshotId();
+  for (auto& manifest : append_manifests_) {
+    manifest.added_snapshot_id = snapshot_id;
+  }
+  for (auto& manifest : rewritten_append_manifests_) {
+    manifest.added_snapshot_id = snapshot_id;
+  }
+  manifests.insert(manifests.end(), append_manifests_.begin(), 
append_manifests_.end());
+  manifests.insert(manifests.end(), rewritten_append_manifests_.begin(),
+                   rewritten_append_manifests_.end());
+
+  // Add all manifests from the snapshot
+  if (snapshot != nullptr) {
+    // Use SnapshotCache to get manifests, similar to snapshot_update.cc

Review Comment:
   ```suggestion
   ```



##########
src/iceberg/update/fast_append.h:
##########
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/update/fast_append.h
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/manifest/manifest_list.h"
+#include "iceberg/result.h"
+#include "iceberg/snapshot.h"
+#include "iceberg/type_fwd.h"
+#include "iceberg/update/snapshot_update.h"
+#include "iceberg/util/content_file_util.h"
+
+namespace iceberg {
+
+/// \brief Append implementation that adds new manifest files for writes.

Review Comment:
   ```suggestion
   /// \brief Appending new files in a table.
   ```



##########
src/iceberg/manifest/manifest_util_internal.h:
##########
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/manifest/manifest_util_internal.h
+/// Internal utility functions for manifest operations.
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+
+namespace iceberg {
+
+/// \brief Copy an append manifest with a new snapshot ID.
+///
+/// This function copies a manifest file that contains only ADDED entries,
+/// rewriting it with a new snapshot ID. This is similar to Java's
+/// ManifestFiles.copyAppendManifest.
+///
+/// \param manifest The manifest file to copy
+/// \param file_io File IO implementation to use
+/// \param schema Table schema
+/// \param spec Partition spec for the manifest
+/// \param snapshot_id The new snapshot ID to assign to entries
+/// \param output_path Path where the new manifest will be written
+/// \param format_version Table format version
+/// \param summary_builder Optional summary builder to update with file metrics
+/// \return The copied manifest file, or an error
+ICEBERG_EXPORT Result<ManifestFile> CopyAppendManifest(
+    const ManifestFile& manifest, std::shared_ptr<FileIO> file_io,
+    std::shared_ptr<Schema> schema, std::shared_ptr<PartitionSpec> spec,

Review Comment:
   ```suggestion
       const ManifestFile& manifest, const std::shared_ptr<FileIO>& file_io,
       const std::shared_ptr<Schema>& schema, const 
std::shared_ptr<PartitionSpec>& spec,
   ```
   
   This avoids an extra copy of each `shared_ptr` argument.



##########
src/iceberg/manifest/manifest_util_internal.cc:
##########
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/manifest/manifest_util_internal.h"
+
+#include <memory>
+#include <optional>
+
+#include "iceberg/inheritable_metadata.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/manifest/manifest_reader.h"
+#include "iceberg/manifest/manifest_writer.h"
+#include "iceberg/result.h"
+#include "iceberg/schema.h"
+#include "iceberg/snapshot.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+Result<ManifestFile> CopyAppendManifest(
+    const ManifestFile& manifest, std::shared_ptr<FileIO> file_io,
+    std::shared_ptr<Schema> schema, std::shared_ptr<PartitionSpec> spec,
+    int64_t snapshot_id, const std::string& output_path, int8_t format_version,
+    SnapshotSummaryBuilder* summary_builder) {
+  ICEBERG_ASSIGN_OR_RAISE(auto reader,
+                          ManifestReader::Make(manifest, file_io, schema, 
spec));

Review Comment:
   This is inconsistent with Java, because the Java implementation explicitly
   sets `firstRowId` to null when reading the manifest in order to clear it.
   However, we respect `manifest.first_row_id` in the created reader. We might
   need to use another `ManifestReader::Make` for this purpose (and modify it to
   accept inheritable_metadata so we don't have to apply it here again).
   
   See 
https://github.com/apache/iceberg/blob/8967729beac20f1fbcbe6b1f43ae010525a2c6f5/core/src/main/java/org/apache/iceberg/ManifestFiles.java#L334-L338



##########
src/iceberg/util/content_file_util.h:
##########
@@ -35,6 +38,113 @@
 
 namespace iceberg {
 
+/// \brief Hash functor for std::shared_ptr<DataFile> based on file path.
+struct ICEBERG_EXPORT DataFilePtrHash {
+  size_t operator()(const std::shared_ptr<DataFile>& file) const {
+    if (!file) {
+      return 0;
+    }
+    return std::hash<std::string>{}(file->file_path);
+  }
+};
+
+/// \brief Equality functor for std::shared_ptr<DataFile> based on file path.
+struct ICEBERG_EXPORT DataFilePtrEqual {
+  bool operator()(const std::shared_ptr<DataFile>& left,
+                  const std::shared_ptr<DataFile>& right) const {
+    if (left == right) {
+      return true;
+    }
+    if (!left || !right) {
+      return false;
+    }
+    return left->file_path == right->file_path;
+  }
+};
+
+/// \brief A set of DataFile pointers, deduplicated by file path.
+///
+/// This preserves insertion order, which is important for row ID assignment 
in v3
+/// manifests. Similar to Java's DataFileSet which uses LinkedHashSet to 
maintain
+/// insertion order.
+class ICEBERG_EXPORT DataFileSet {

Review Comment:
   Should we do something like this to simplify the implementation?
   
   ```cpp
   /// \brief A set of DataFile pointers with insertion order preserved and 
deduplicated by
   /// file path.
   class ICEBERG_EXPORT DataFileSet {
    public:
     using value_type = std::shared_ptr<DataFile>;
     using iterator = typename std::vector<value_type>::iterator;
     using const_iterator = typename std::vector<value_type>::const_iterator;
     using difference_type = typename std::vector<value_type>::difference_type;
   
     DataFileSet() = default;
   
     /// \brief Insert a data file into the set.
     /// \param file The data file to insert
     /// \return A pair with an iterator to the inserted element (or the 
existing one) and
     ///         a bool indicating whether insertion took place
     std::pair<iterator, bool> insert(const value_type& file) { return 
InsertImpl(file); }
   
     /// \brief Variant of insert that takes an rvalue reference.
     std::pair<iterator, bool> insert(value_type&& file) {
       return InsertImpl(std::move(file));
     }
   
     /// \brief Get the number of elements in the set.
     size_t size() const { return elements_.size(); }
   
     /// \brief Check if the set is empty.
     bool empty() const { return elements_.empty(); }
   
     /// \brief Clear all elements from the set.
     void clear() {
       elements_.clear();
       index_by_path_.clear();
     }
   
     /// \brief Get iterator to the beginning.
     iterator begin() { return elements_.begin(); }
     const_iterator begin() const { return elements_.begin(); }
     const_iterator cbegin() const { return elements_.cbegin(); }
   
     /// \brief Get iterator to the end.
     iterator end() { return elements_.end(); }
     const_iterator end() const { return elements_.end(); }
     const_iterator cend() const { return elements_.cend(); }
   
    private:
     std::pair<iterator, bool> InsertImpl(value_type file) {
       if (!file) {
         return {elements_.end(), false};
       }
   
       auto [index_iter, inserted] =
           index_by_path_.try_emplace(file->file_path, elements_.size());
       if (!inserted) {
         auto pos = static_cast<difference_type>(index_iter->second);
         return {elements_.begin() + pos, false};
       }
   
       elements_.push_back(std::move(file));
       return {std::prev(elements_.end()), true};
     }
   
     std::vector<value_type> elements_;
     std::unordered_map<std::string_view, size_t, StringHash, StringEqual> 
index_by_path_;
   };
   ```



##########
src/iceberg/test/fast_append_test.cc:
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/update/fast_append.h"
+
+#include <format>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "iceberg/avro/avro_register.h"
+#include "iceberg/partition_spec.h"
+#include "iceberg/schema.h"
+#include "iceberg/table_metadata.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/test/test_resource.h"
+#include "iceberg/test/update_test_base.h"
+#include "iceberg/util/uuid.h"
+
+namespace iceberg {
+
+class FastAppendTest : public UpdateTestBase {
+ protected:
+  static void SetUpTestSuite() { avro::RegisterAll(); }
+
+  void SetUp() override {
+    UpdateTestBase::SetUp();
+
+    ASSERT_THAT(catalog_->DropTable(table_ident_, /*purge=*/false), IsOk());
+
+    auto metadata_location = std::format("{}/metadata/00001-{}.metadata.json",
+                                         table_location_, 
Uuid::GenerateV7().ToString());
+    ICEBERG_UNWRAP_OR_FAIL(
+        auto metadata, 
ReadTableMetadataFromResource("TableMetadataV2ValidMinimal.json"));
+    metadata->location = table_location_;
+    ASSERT_THAT(TableMetadataUtil::Write(*file_io_, metadata_location, 
*metadata),
+                IsOk());
+    ICEBERG_UNWRAP_OR_FAIL(table_,
+                           catalog_->RegisterTable(table_ident_, 
metadata_location));
+
+    // Get partition spec and schema from the base table
+    ICEBERG_UNWRAP_OR_FAIL(spec_, table_->spec());
+    ICEBERG_UNWRAP_OR_FAIL(schema_, table_->schema());
+
+    // Create test data files
+    file_a_ = CreateDataFile("/data/file_a.parquet", 100, 1024);
+    file_b_ = CreateDataFile("/data/file_b.parquet", 200, 2048);

Review Comment:
   ```suggestion
       file_a_ = CreateDataFile("/data/file_a.parquet", /*size=*/100, 
/*partition_value=*/1024);
       file_b_ = CreateDataFile("/data/file_b.parquet", /*size=*/200, 
/*partition_value=*/2048);
   ```



##########
src/iceberg/update/fast_append.cc:
##########
@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/update/fast_append.h"
+
+#include <iterator>
+#include <ranges>
+#include <vector>
+
+#include "iceberg/constants.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/manifest/manifest_util_internal.h"
+#include "iceberg/snapshot.h"
+#include "iceberg/table.h"
+#include "iceberg/table_metadata.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/transaction.h"
+#include "iceberg/util/error_collector.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+Result<std::unique_ptr<FastAppend>> FastAppend::Make(
+    std::string table_name, std::shared_ptr<Transaction> transaction) {
+  ICEBERG_PRECHECK(!table_name.empty(), "Table name cannot be empty");
+  ICEBERG_PRECHECK(transaction != nullptr,
+                   "Cannot create FastAppend without a transaction");
+  return std::unique_ptr<FastAppend>(
+      new FastAppend(std::move(table_name), std::move(transaction)));
+}
+
+FastAppend::FastAppend(std::string table_name, std::shared_ptr<Transaction> 
transaction)
+    : SnapshotUpdate(std::move(transaction)), 
table_name_(std::move(table_name)) {}
+
+FastAppend& FastAppend::AppendFile(std::shared_ptr<DataFile> file) {
+  ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null");
+  ICEBERG_BUILDER_CHECK(file->partition_spec_id.has_value(),
+                        "Data file must have partition spec ID");
+
+  int32_t spec_id = file->partition_spec_id.value();
+  ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto spec, Spec(spec_id));
+
+  auto& data_files = new_data_files_by_spec_[spec_id];
+  auto [iter, inserted] = data_files.insert(file);
+  if (inserted) {
+    has_new_files_ = true;
+    ICEBERG_BUILDER_RETURN_IF_ERROR(summary_.AddedFile(*spec, *file));
+  }
+
+  return *this;
+}
+
+FastAppend& FastAppend::AppendManifest(const ManifestFile& manifest) {
+  ICEBERG_BUILDER_CHECK(!manifest.has_existing_files(),
+                        "Cannot append manifest with existing files");
+  ICEBERG_BUILDER_CHECK(!manifest.has_deleted_files(),
+                        "Cannot append manifest with deleted files");
+  ICEBERG_BUILDER_CHECK(manifest.added_snapshot_id == kInvalidSnapshotId,
+                        "Snapshot id must be assigned during commit");
+  ICEBERG_BUILDER_CHECK(manifest.sequence_number == kInvalidSequenceNumber,
+                        "Sequence number must be assigned during commit");
+
+  if (can_inherit_snapshot_id() && manifest.added_snapshot_id == 
kInvalidSnapshotId) {
+    summary_.AddedManifest(manifest);
+    append_manifests_.push_back(manifest);
+  } else {
+    // The manifest must be rewritten with this update's snapshot ID
+    ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto copied_manifest, 
CopyManifest(manifest));
+    rewritten_append_manifests_.push_back(std::move(copied_manifest));
+  }
+
+  return *this;
+}
+
+FastAppend& FastAppend::ToBranch(const std::string& branch) {

Review Comment:
   Should we remove this function and use `SetTargetBranch` directly? It
   duplicates that functionality.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to