This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new 75204b17 chore: refactor DataFileSet and make WriteManifests to accept 
span (#519)
75204b17 is described below

commit 75204b171563739f71bb0e4cc865c2c3c8c6b50e
Author: Junwang Zhao <[email protected]>
AuthorDate: Wed Jan 21 12:49:04 2026 +0800

    chore: refactor DataFileSet and make WriteManifests to accept span (#519)
    
    - Move DataFileSet out of content_file_util.h to data_file_set.h to
    reduce header dependencies
    - Refactor WriteDataManifests and WriteDeleteManifests to accept span
    instead of vector
---
 src/iceberg/test/CMakeLists.txt                    |   1 +
 src/iceberg/test/data_file_set_test.cc             | 284 +++++++++++++++++++++
 src/iceberg/test/meson.build                       |   1 +
 src/iceberg/update/fast_append.cc                  |   7 +-
 src/iceberg/update/fast_append.h                   |   2 +-
 src/iceberg/update/snapshot_update.cc              |  17 +-
 src/iceberg/update/snapshot_update.h               |  12 +-
 src/iceberg/util/content_file_util.h               |  68 -----
 .../util/{content_file_util.h => data_file_set.h}  |  42 +--
 src/iceberg/util/meson.build                       |   1 +
 10 files changed, 313 insertions(+), 122 deletions(-)

diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index 4c1679b7..62864cd6 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -109,6 +109,7 @@ add_iceberg_test(util_test
                  SOURCES
                  bucket_util_test.cc
                  config_test.cc
+                 data_file_set_test.cc
                  decimal_test.cc
                  endian_test.cc
                  formatter_test.cc
diff --git a/src/iceberg/test/data_file_set_test.cc 
b/src/iceberg/test/data_file_set_test.cc
new file mode 100644
index 00000000..60539adf
--- /dev/null
+++ b/src/iceberg/test/data_file_set_test.cc
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/data_file_set.h"
+
+#include <gtest/gtest.h>
+
+#include "iceberg/file_format.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/row/partition_values.h"
+
+namespace iceberg {
+
+class DataFileSetTest : public ::testing::Test {
+ protected:
+  std::shared_ptr<DataFile> CreateDataFile(const std::string& path, int64_t 
size = 100) {
+    auto file = std::make_shared<DataFile>();
+    file->file_path = path;
+    file->file_format = FileFormatType::kParquet;
+    file->file_size_in_bytes = size;
+    file->record_count = 10;
+    file->content = DataFile::Content::kData;
+    return file;
+  }
+};
+
+TEST_F(DataFileSetTest, EmptySet) {
+  DataFileSet set;
+  EXPECT_TRUE(set.empty());
+  EXPECT_EQ(set.size(), 0);
+  EXPECT_EQ(set.begin(), set.end());
+  EXPECT_TRUE(set.as_span().empty());
+}
+
+TEST_F(DataFileSetTest, InsertSingleFile) {
+  DataFileSet set;
+  auto file = CreateDataFile("/path/to/file.parquet");
+
+  auto [iter, inserted] = set.insert(file);
+  EXPECT_TRUE(inserted);
+  EXPECT_EQ(*iter, file);
+  EXPECT_FALSE(set.empty());
+  EXPECT_EQ(set.size(), 1);
+}
+
+TEST_F(DataFileSetTest, InsertDuplicateFile) {
+  DataFileSet set;
+  auto file1 = CreateDataFile("/path/to/file.parquet");
+  auto file2 = CreateDataFile("/path/to/file.parquet");  // Same path
+
+  auto [iter1, inserted1] = set.insert(file1);
+  EXPECT_TRUE(inserted1);
+
+  auto [iter2, inserted2] = set.insert(file2);
+  EXPECT_FALSE(inserted2);
+  EXPECT_EQ(iter1, iter2);   // Should point to the same element
+  EXPECT_EQ(set.size(), 1);  // Should still be size 1
+}
+
+TEST_F(DataFileSetTest, InsertDifferentFiles) {
+  DataFileSet set;
+  auto file1 = CreateDataFile("/path/to/file1.parquet");
+  auto file2 = CreateDataFile("/path/to/file2.parquet");
+  auto file3 = CreateDataFile("/path/to/file3.parquet");
+
+  set.insert(file1);
+  set.insert(file2);
+  set.insert(file3);
+
+  EXPECT_EQ(set.size(), 3);
+  EXPECT_FALSE(set.empty());
+}
+
+TEST_F(DataFileSetTest, InsertionOrderPreserved) {
+  DataFileSet set;
+  auto file1 = CreateDataFile("/path/to/file1.parquet");
+  auto file2 = CreateDataFile("/path/to/file2.parquet");
+  auto file3 = CreateDataFile("/path/to/file3.parquet");
+
+  set.insert(file1);
+  set.insert(file2);
+  set.insert(file3);
+
+  // Iterate and verify order
+  std::vector<std::string> paths;
+  for (const auto& file : set) {
+    paths.push_back(file->file_path);
+  }
+
+  EXPECT_EQ(paths.size(), 3);
+  EXPECT_EQ(paths[0], "/path/to/file1.parquet");
+  EXPECT_EQ(paths[1], "/path/to/file2.parquet");
+  EXPECT_EQ(paths[2], "/path/to/file3.parquet");
+}
+
+TEST_F(DataFileSetTest, AsSpan) {
+  DataFileSet set;
+  EXPECT_TRUE(set.as_span().empty());
+
+  // Single element
+  auto file0 = CreateDataFile("/path/to/file0.parquet");
+  set.insert(file0);
+  {
+    auto span = set.as_span();
+    EXPECT_EQ(span.size(), 1);
+    EXPECT_EQ(span[0]->file_path, "/path/to/file0.parquet");
+    EXPECT_EQ(span[0], file0);  // Same pointer, span is a view
+  }
+
+  // Multiple elements
+  auto file1 = CreateDataFile("/path/to/file1.parquet");
+  auto file2 = CreateDataFile("/path/to/file2.parquet");
+  set.insert(file1);
+  set.insert(file2);
+
+  auto span = set.as_span();
+  EXPECT_EQ(span.size(), 3);
+  EXPECT_EQ(span[0]->file_path, "/path/to/file0.parquet");
+  EXPECT_EQ(span[1]->file_path, "/path/to/file1.parquet");
+  EXPECT_EQ(span[2]->file_path, "/path/to/file2.parquet");
+
+  // Span matches set iteration order and identity
+  size_t i = 0;
+  for (const auto& file : set) {
+    EXPECT_EQ(span[i], file) << "Span element " << i << " should match set 
iterator";
+    ++i;
+  }
+  EXPECT_EQ(i, span.size());
+
+  // Span works with range-for
+  i = 0;
+  for (const auto& file : span) {
+    EXPECT_EQ(file->file_path, span[i]->file_path);
+    ++i;
+  }
+  EXPECT_EQ(i, 3);
+
+  set.clear();
+  EXPECT_TRUE(set.as_span().empty());
+}
+
+TEST_F(DataFileSetTest, InsertDuplicatePreservesOrder) {
+  DataFileSet set;
+  auto file1 = CreateDataFile("/path/to/file1.parquet");
+  auto file2 = CreateDataFile("/path/to/file2.parquet");
+  auto file3 = CreateDataFile("/path/to/file1.parquet");  // Duplicate of file1
+
+  set.insert(file1);
+  set.insert(file2);
+  set.insert(file3);  // Should not insert, but order should be preserved
+
+  EXPECT_EQ(set.size(), 2);
+
+  std::vector<std::string> paths;
+  for (const auto& file : set) {
+    paths.push_back(file->file_path);
+  }
+
+  EXPECT_EQ(paths[0], "/path/to/file1.parquet");
+  EXPECT_EQ(paths[1], "/path/to/file2.parquet");
+}
+
+TEST_F(DataFileSetTest, InsertNullFile) {
+  DataFileSet set;
+  std::shared_ptr<DataFile> null_file = nullptr;
+
+  auto [iter, inserted] = set.insert(null_file);
+  EXPECT_FALSE(inserted);
+  EXPECT_EQ(iter, set.end());
+  EXPECT_TRUE(set.empty());
+  EXPECT_EQ(set.size(), 0);
+}
+
+TEST_F(DataFileSetTest, InsertMoveSemantics) {
+  DataFileSet set;
+  auto file1 = CreateDataFile("/path/to/file1.parquet");
+  auto file2 = CreateDataFile("/path/to/file2.parquet");
+
+  // Insert using move
+  auto [iter1, inserted1] = set.insert(std::move(file1));
+  EXPECT_TRUE(inserted1);
+  EXPECT_EQ(file1, nullptr);  // Should be moved
+
+  // Insert using copy
+  auto [iter2, inserted2] = set.insert(file2);
+  EXPECT_TRUE(inserted2);
+  EXPECT_NE(file2, nullptr);  // Should still be valid
+
+  EXPECT_EQ(set.size(), 2);
+}
+
+TEST_F(DataFileSetTest, Clear) {
+  DataFileSet set;
+  set.insert(CreateDataFile("/path/to/file1.parquet"));
+  set.insert(CreateDataFile("/path/to/file2.parquet"));
+
+  EXPECT_EQ(set.size(), 2);
+  set.clear();
+  EXPECT_TRUE(set.empty());
+  EXPECT_EQ(set.size(), 0);
+  EXPECT_EQ(set.begin(), set.end());
+}
+
+TEST_F(DataFileSetTest, IteratorOperations) {
+  DataFileSet set;
+  auto file1 = CreateDataFile("/path/to/file1.parquet");
+  auto file2 = CreateDataFile("/path/to/file2.parquet");
+  auto file3 = CreateDataFile("/path/to/file3.parquet");
+
+  set.insert(file1);
+  set.insert(file2);
+  set.insert(file3);
+
+  // Test const iterators
+  const auto& const_set = set;
+  EXPECT_NE(const_set.begin(), const_set.end());
+  EXPECT_NE(const_set.cbegin(), const_set.cend());
+
+  // Test iterator increment
+  auto it = set.begin();
+  EXPECT_EQ((*it)->file_path, "/path/to/file1.parquet");
+  ++it;
+  EXPECT_EQ((*it)->file_path, "/path/to/file2.parquet");
+  ++it;
+  EXPECT_EQ((*it)->file_path, "/path/to/file3.parquet");
+  ++it;
+  EXPECT_EQ(it, set.end());
+}
+
+TEST_F(DataFileSetTest, RangeBasedForLoop) {
+  DataFileSet set;
+  set.insert(CreateDataFile("/path/to/file1.parquet"));
+  set.insert(CreateDataFile("/path/to/file2.parquet"));
+  set.insert(CreateDataFile("/path/to/file3.parquet"));
+
+  int count = 0;
+  for (const auto& file : set) {
+    EXPECT_NE(file, nullptr);
+    ++count;
+  }
+  EXPECT_EQ(count, 3);
+}
+
+TEST_F(DataFileSetTest, CaseSensitivePaths) {
+  DataFileSet set;
+  auto file1 = CreateDataFile("/path/to/file.parquet");
+  auto file2 = CreateDataFile("/path/to/FILE.parquet");  // Different case
+
+  set.insert(file1);
+  set.insert(file2);
+
+  // Should be treated as different files
+  EXPECT_EQ(set.size(), 2);
+}
+
+TEST_F(DataFileSetTest, MultipleInsertsSameFile) {
+  DataFileSet set;
+  auto file = CreateDataFile("/path/to/file.parquet");
+
+  // Insert the same file multiple times
+  set.insert(file);
+  set.insert(file);
+  set.insert(file);
+
+  EXPECT_EQ(set.size(), 1);
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index 791340be..5e3007c4 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -84,6 +84,7 @@ iceberg_tests = {
         'sources': files(
             'bucket_util_test.cc',
             'config_test.cc',
+            'data_file_set_test.cc',
             'decimal_test.cc',
             'endian_test.cc',
             'formatter_test.cc',
diff --git a/src/iceberg/update/fast_append.cc 
b/src/iceberg/update/fast_append.cc
index c7f66f2f..3c132a40 100644
--- a/src/iceberg/update/fast_append.cc
+++ b/src/iceberg/update/fast_append.cc
@@ -20,7 +20,6 @@
 #include "iceberg/update/fast_append.h"
 
 #include <iterator>
-#include <ranges>
 #include <vector>
 
 #include "iceberg/constants.h"
@@ -198,10 +197,8 @@ Result<std::vector<ManifestFile>> 
FastAppend::WriteNewManifests() {
   if (new_manifests_.empty() && !new_data_files_by_spec_.empty()) {
     for (const auto& [spec_id, data_files] : new_data_files_by_spec_) {
       ICEBERG_ASSIGN_OR_RAISE(auto spec, Spec(spec_id));
-      std::vector<std::shared_ptr<DataFile>> files;
-      files.reserve(data_files.size());
-      std::ranges::copy(data_files, std::back_inserter(files));
-      ICEBERG_ASSIGN_OR_RAISE(auto written_manifests, 
WriteDataManifests(files, spec));
+      ICEBERG_ASSIGN_OR_RAISE(auto written_manifests,
+                              WriteDataManifests(data_files.as_span(), spec));
       new_manifests_.insert(new_manifests_.end(),
                             std::make_move_iterator(written_manifests.begin()),
                             std::make_move_iterator(written_manifests.end()));
diff --git a/src/iceberg/update/fast_append.h b/src/iceberg/update/fast_append.h
index 87887c74..7f5cbb09 100644
--- a/src/iceberg/update/fast_append.h
+++ b/src/iceberg/update/fast_append.h
@@ -30,7 +30,7 @@
 #include "iceberg/result.h"
 #include "iceberg/type_fwd.h"
 #include "iceberg/update/snapshot_update.h"
-#include "iceberg/util/content_file_util.h"
+#include "iceberg/util/data_file_set.h"
 
 namespace iceberg {
 
diff --git a/src/iceberg/update/snapshot_update.cc 
b/src/iceberg/update/snapshot_update.cc
index 2bbb2d50..38c5129f 100644
--- a/src/iceberg/update/snapshot_update.cc
+++ b/src/iceberg/update/snapshot_update.cc
@@ -30,7 +30,6 @@
 #include "iceberg/manifest/manifest_writer.h"
 #include "iceberg/manifest/rolling_manifest_writer.h"
 #include "iceberg/partition_summary_internal.h"
-#include "iceberg/snapshot.h"
 #include "iceberg/table.h"
 #include "iceberg/transaction.h"
 #include "iceberg/util/macros.h"
@@ -166,10 +165,10 @@ 
SnapshotUpdate::SnapshotUpdate(std::shared_ptr<Transaction> transaction)
 
 // TODO(xxx): write manifests in parallel
 Result<std::vector<ManifestFile>> SnapshotUpdate::WriteDataManifests(
-    const std::vector<std::shared_ptr<DataFile>>& data_files,
+    std::span<const std::shared_ptr<DataFile>> files,
     const std::shared_ptr<PartitionSpec>& spec,
     std::optional<int64_t> data_sequence_number) {
-  if (data_files.empty()) {
+  if (files.empty()) {
     return std::vector<ManifestFile>{};
   }
 
@@ -185,7 +184,7 @@ Result<std::vector<ManifestFile>> 
SnapshotUpdate::WriteDataManifests(
       },
       target_manifest_size_bytes_);
 
-  for (const auto& file : data_files) {
+  for (const auto& file : files) {
     ICEBERG_RETURN_UNEXPECTED(rolling_writer.WriteAddedEntry(file, 
data_sequence_number));
   }
   ICEBERG_RETURN_UNEXPECTED(rolling_writer.Close());
@@ -194,9 +193,9 @@ Result<std::vector<ManifestFile>> 
SnapshotUpdate::WriteDataManifests(
 
 // TODO(xxx): write manifests in parallel
 Result<std::vector<ManifestFile>> SnapshotUpdate::WriteDeleteManifests(
-    const std::vector<std::shared_ptr<DataFile>>& delete_files,
+    std::span<const std::shared_ptr<DataFile>> files,
     const std::shared_ptr<PartitionSpec>& spec) {
-  if (delete_files.empty()) {
+  if (files.empty()) {
     return std::vector<ManifestFile>{};
   }
 
@@ -211,9 +210,9 @@ Result<std::vector<ManifestFile>> 
SnapshotUpdate::WriteDeleteManifests(
       },
       target_manifest_size_bytes_);
 
-  for (const auto& file : delete_files) {
-    /// FIXME: Java impl wrap it with `PendingDeleteFile` and deals with
-    /// file->data_sequenece_number
+  for (const auto& file : files) {
+    // FIXME: Java impl wrap it with `PendingDeleteFile` and deals with
+    // file->data_sequence_number
     ICEBERG_RETURN_UNEXPECTED(rolling_writer.WriteAddedEntry(file));
   }
   ICEBERG_RETURN_UNEXPECTED(rolling_writer.Close());
diff --git a/src/iceberg/update/snapshot_update.h 
b/src/iceberg/update/snapshot_update.h
index f31327fc..12c3b19d 100644
--- a/src/iceberg/update/snapshot_update.h
+++ b/src/iceberg/update/snapshot_update.h
@@ -22,13 +22,13 @@
 #include <functional>
 #include <memory>
 #include <optional>
+#include <span>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 
 #include "iceberg/iceberg_export.h"
-#include "iceberg/manifest/manifest_list.h"
 #include "iceberg/result.h"
 #include "iceberg/snapshot.h"
 #include "iceberg/type_fwd.h"
@@ -103,24 +103,22 @@ class ICEBERG_EXPORT SnapshotUpdate : public 
PendingUpdate {
 
   /// \brief Write data manifests for the given data files
   ///
-  /// \param data_files The data files to write
+  /// \param files Data files to write
   /// \param spec The partition spec to use
   /// \param data_sequence_number Optional data sequence number for the files
   /// \return A vector of manifest files
-  /// TODO(xxx): Change signature to accept iterator begin/end instead of 
vector to avoid
-  /// intermediate vector allocations (e.g., from DataFileSet)
   Result<std::vector<ManifestFile>> WriteDataManifests(
-      const std::vector<std::shared_ptr<DataFile>>& data_files,
+      std::span<const std::shared_ptr<DataFile>> files,
       const std::shared_ptr<PartitionSpec>& spec,
       std::optional<int64_t> data_sequence_number = std::nullopt);
 
   /// \brief Write delete manifests for the given delete files
   ///
-  /// \param delete_files The delete files to write
+  /// \param files Delete files to write
   /// \param spec The partition spec to use
   /// \return A vector of manifest files
   Result<std::vector<ManifestFile>> WriteDeleteManifests(
-      const std::vector<std::shared_ptr<DataFile>>& delete_files,
+      std::span<const std::shared_ptr<DataFile>> files,
       const std::shared_ptr<PartitionSpec>& spec);
 
   Status SetTargetBranch(const std::string& branch);
diff --git a/src/iceberg/util/content_file_util.h 
b/src/iceberg/util/content_file_util.h
index 95a8d634..f547716d 100644
--- a/src/iceberg/util/content_file_util.h
+++ b/src/iceberg/util/content_file_util.h
@@ -22,12 +22,10 @@
 /// \file iceberg/util/content_file_util.h
 /// Utility functions for content files (data files and delete files).
 
-#include <memory>
 #include <optional>
 #include <span>
 #include <string>
 #include <unordered_set>
-#include <vector>
 
 #include "iceberg/iceberg_export.h"
 #include "iceberg/manifest/manifest_entry.h"
@@ -36,72 +34,6 @@
 
 namespace iceberg {
 
-/// \brief A set of DataFile pointers with insertion order preserved and 
deduplicated by
-/// file path.
-class ICEBERG_EXPORT DataFileSet {
- public:
-  using value_type = std::shared_ptr<DataFile>;
-  using iterator = typename std::vector<value_type>::iterator;
-  using const_iterator = typename std::vector<value_type>::const_iterator;
-  using difference_type = typename std::vector<value_type>::difference_type;
-
-  DataFileSet() = default;
-
-  /// \brief Insert a data file into the set.
-  /// \param file The data file to insert
-  /// \return A pair with an iterator to the inserted element (or the existing 
one) and
-  ///         a bool indicating whether insertion took place
-  std::pair<iterator, bool> insert(const value_type& file) { return 
InsertImpl(file); }
-
-  /// \brief Insert a data file into the set (move version).
-  std::pair<iterator, bool> insert(value_type&& file) {
-    return InsertImpl(std::move(file));
-  }
-
-  /// \brief Get the number of elements in the set.
-  size_t size() const { return elements_.size(); }
-
-  /// \brief Check if the set is empty.
-  bool empty() const { return elements_.empty(); }
-
-  /// \brief Clear all elements from the set.
-  void clear() {
-    elements_.clear();
-    index_by_path_.clear();
-  }
-
-  /// \brief Get iterator to the beginning.
-  iterator begin() { return elements_.begin(); }
-  const_iterator begin() const { return elements_.begin(); }
-  const_iterator cbegin() const { return elements_.cbegin(); }
-
-  /// \brief Get iterator to the end.
-  iterator end() { return elements_.end(); }
-  const_iterator end() const { return elements_.end(); }
-  const_iterator cend() const { return elements_.cend(); }
-
- private:
-  std::pair<iterator, bool> InsertImpl(value_type file) {
-    if (!file) {
-      return {elements_.end(), false};
-    }
-
-    auto [index_iter, inserted] =
-        index_by_path_.try_emplace(file->file_path, elements_.size());
-    if (!inserted) {
-      auto pos = static_cast<difference_type>(index_iter->second);
-      return {elements_.begin() + pos, false};
-    }
-
-    elements_.push_back(std::move(file));
-    return {std::prev(elements_.end()), true};
-  }
-
-  // Vector to preserve insertion order
-  std::vector<value_type> elements_;
-  std::unordered_map<std::string_view, size_t, StringHash, StringEqual> 
index_by_path_;
-};
-
 /// \brief Utility functions for content files.
 struct ICEBERG_EXPORT ContentFileUtil {
   /// \brief Check if a delete file is a deletion vector (DV).
diff --git a/src/iceberg/util/content_file_util.h 
b/src/iceberg/util/data_file_set.h
similarity index 72%
copy from src/iceberg/util/content_file_util.h
copy to src/iceberg/util/data_file_set.h
index 95a8d634..741b34e5 100644
--- a/src/iceberg/util/content_file_util.h
+++ b/src/iceberg/util/data_file_set.h
@@ -19,20 +19,20 @@
 
 #pragma once
 
-/// \file iceberg/util/content_file_util.h
-/// Utility functions for content files (data files and delete files).
+/// \file iceberg/util/data_file_set.h
+/// A set of DataFile pointers with insertion order preserved and deduplicated 
by file
+/// path.
 
+#include <iterator>
 #include <memory>
-#include <optional>
 #include <span>
-#include <string>
-#include <unordered_set>
+#include <string_view>
+#include <unordered_map>
 #include <vector>
 
 #include "iceberg/iceberg_export.h"
 #include "iceberg/manifest/manifest_entry.h"
-#include "iceberg/result.h"
-#include "iceberg/type_fwd.h"
+#include "iceberg/util/string_util.h"
 
 namespace iceberg {
 
@@ -80,6 +80,9 @@ class ICEBERG_EXPORT DataFileSet {
   const_iterator end() const { return elements_.end(); }
   const_iterator cend() const { return elements_.cend(); }
 
+  /// \brief Get a non-owning view of the data files in insertion order.
+  std::span<const value_type> as_span() const { return elements_; }
+
  private:
   std::pair<iterator, bool> InsertImpl(value_type file) {
     if (!file) {
@@ -102,29 +105,4 @@ class ICEBERG_EXPORT DataFileSet {
   std::unordered_map<std::string_view, size_t, StringHash, StringEqual> 
index_by_path_;
 };
 
-/// \brief Utility functions for content files.
-struct ICEBERG_EXPORT ContentFileUtil {
-  /// \brief Check if a delete file is a deletion vector (DV).
-  static bool IsDV(const DataFile& file);
-
-  /// \brief Get the referenced data file path from a position delete file.
-  static Result<std::optional<std::string>> ReferencedDataFile(const DataFile& 
file);
-
-  /// \brief Check if a delete file is file-scoped.
-  static Result<bool> IsFileScoped(const DataFile& file);
-
-  /// \brief Check if a collection of delete files contains exactly one DV.
-  static bool ContainsSingleDV(std::span<const std::shared_ptr<DataFile>> 
files);
-
-  /// \brief Generate a description string for a deletion vector.
-  static std::string DVDesc(const DataFile& file);
-
-  /// \brief In-place drop stats.
-  static void DropAllStats(DataFile& data_file);
-
-  /// \brief Preserve stats based on selected columns.
-  static void DropUnselectedStats(DataFile& data_file,
-                                  const std::unordered_set<int32_t>& 
selected_columns);
-};
-
 }  // namespace iceberg
diff --git a/src/iceberg/util/meson.build b/src/iceberg/util/meson.build
index 95952bb8..496a7575 100644
--- a/src/iceberg/util/meson.build
+++ b/src/iceberg/util/meson.build
@@ -22,6 +22,7 @@ install_headers(
         'config.h',
         'content_file_util.h',
         'conversions.h',
+        'data_file_set.h',
         'decimal.h',
         'endian.h',
         'error_collector.h',

Reply via email to