amogh-jahagirdar commented on code in PR #13222:
URL: https://github.com/apache/iceberg/pull/13222#discussion_r2190574049

##########
core/src/jmh/java/org/apache/iceberg/RewriteDataFilesBenchmark.java:
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import static org.apache.iceberg.types.Types.NestedField.required;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.io.LocationProvider;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.DataFileSet;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Timeout;
+import org.openjdk.jmh.annotations.Warmup;
+
+/**
+ * A benchmark that evaluates the performance of rewriting data files in the table.
+ *
+ * <p>To run this benchmark: <code>
+ *   ./gradlew :iceberg-core:jmh
+ *       -PjmhIncludeRegex=RewriteDataFilesBenchmark
+ *       -PjmhOutputPath=benchmark/rewrite-data-files-benchmark.txt
+ * </code>
+ */
+@Fork(1)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3)
+@Measurement(iterations = 5)
+@BenchmarkMode(Mode.SingleShotTime)
+@Timeout(time = 10, timeUnit = TimeUnit.MINUTES)
+public class RewriteDataFilesBenchmark {
+
+  private static final String TABLE_IDENT = "tblX";
+  private static final Schema SCHEMA =
+      new Schema(
+          required(1, "int_col", Types.IntegerType.get()),
+          required(2, "long_col", Types.LongType.get()),
+          required(3, "decimal_col", Types.DecimalType.of(10, 10)),
+          required(4, "date_col", Types.DateType.get()),
+          required(5, "timestamp_col", Types.TimestampType.withoutZone()),
+          required(6, "timestamp_tz_col", Types.TimestampType.withZone()),
+          required(7, "str_col", Types.StringType.get()));
+  private static final PartitionSpec SPEC = PartitionSpec.unpartitioned();
+  private static final HadoopTables TABLES = new HadoopTables();
+
+  private Table table;
+  private DataFileSet dataFilesToRemove;
+  private DataFileSet dataFilesToAdd;
+
+  @Param({"50000", "100000", "500000", "1000000", "2000000"})
+  private int numFiles;
+
+  @Param({"5", "25", "50", "100"})
+  private int percentDataFilesRewritten;
+
+  @Setup
+  public void setupBenchmark() throws IOException {
+    initTable();
+    initFiles();
+  }
+
+  @TearDown
+  public void tearDownBenchmark() {
+    dropTable();
+  }
+
+  @Benchmark
+  @Threads(1)
+  public void rewriteDataFiles() {
+    Snapshot currentSnapshot = table.currentSnapshot();
+    RewriteFiles rewriteFiles = table.newRewrite();
+    rewriteFiles.validateFromSnapshot(currentSnapshot.snapshotId());
+    dataFilesToAdd.forEach(rewriteFiles::addFile);
+    dataFilesToRemove.forEach(rewriteFiles::deleteFile);
+    rewriteFiles.commit();
+    table.manageSnapshots().rollbackTo(currentSnapshot.snapshotId()).commit();
+  }
+
+  private void initTable() {
+    if (TABLES.exists(TABLE_IDENT)) {
+      TABLES.dropTable(TABLE_IDENT);
+    }
+
+    this.table =
+        TABLES.create(
+            SCHEMA, SPEC, ImmutableMap.of(TableProperties.FORMAT_VERSION, "3"), TABLE_IDENT);
+  }
+
+  private void dropTable() {
+    TABLES.dropTable(TABLE_IDENT);
+  }
+
+  private void initFiles() throws IOException {
+    List<DataFile> pendingDataFiles = Lists.newArrayListWithExpectedSize(numFiles);
+    int numDataFilesToRewrite = (int) Math.ceil(numFiles * (percentDataFilesRewritten / 100.0));
+    Map<String, DataFile> filesToReplace = Maps.newHashMapWithExpectedSize(numDataFilesToRewrite);
+    RowDelta rowDelta = table.newRowDelta();
+    for (int ordinal = 0; ordinal < numFiles; ordinal++) {
+      DataFile dataFile = generateDataFile();
+      rowDelta.addRows(dataFile);
+      DeleteFile deleteFile = FileGenerationUtil.generateDV(table, dataFile);
+      rowDelta.addDeletes(deleteFile);
+      if (numDataFilesToRewrite > 0) {
+        filesToReplace.put(dataFile.location(), dataFile);
+        DataFile pendingDataFile = generateDataFile(dataFile.recordCount());
+        rowDelta.addRows(pendingDataFile);
+        pendingDataFiles.add(pendingDataFile);
+        numDataFilesToRewrite--;
+      }
+    }
+
+    rowDelta.commit();
+
+    List<DataFile> dataFilesReadFromManifests = Lists.newArrayList();
+    for (ManifestFile deleteManifest : table.currentSnapshot().dataManifests(table.io())) {

Review Comment:
   Should the variable be called `dataManifest` instead of `deleteManifest`?
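   For illustration, a minimal sketch of the suggested rename. The loop body is elided in the hunk above, so the manifest-reading code here is an assumption about what the PR does, not its actual implementation:

   ```java
   // the loop iterates the snapshot's data manifests, so "dataManifest"
   // describes what each iteration actually visits
   for (ManifestFile dataManifest : table.currentSnapshot().dataManifests(table.io())) {
     // hypothetical body: read back the data files committed by the row delta
     try (ManifestReader<DataFile> reader = ManifestFiles.read(dataManifest, table.io())) {
       reader.forEach(dataFilesReadFromManifests::add);
     }
   }
   ```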
##########
core/src/main/java/org/apache/iceberg/ManifestFilterManager.java:
##########
@@ -488,6 +512,10 @@ private ManifestFile filterManifestWithDeletedFiles(
                 if (allRowsMatch) {
                   writer.delete(entry);
+                  F copyWithoutStats = file.copyWithoutStats();

Review Comment:
   Nit: since the variable is narrowly scoped, I feel like `fileCopy` is sufficient, but up to you.


##########
core/src/main/java/org/apache/iceberg/ManifestFilterManager.java:
##########
@@ -488,6 +512,10 @@ private ManifestFile filterManifestWithDeletedFiles(
                 if (allRowsMatch) {
                   writer.delete(entry);
+                  F copyWithoutStats = file.copyWithoutStats();
+                  // add the file here in case it was deleted using an expression. The
+                  // DeleteManifestFilterManager will then remove its matching DV
+                  deleteFiles.add(copyWithoutStats);

Review Comment:
   This makes sense to me. If an entry is marked for delete or passes strict evaluation, we mark it for delete and add it to the tracking set of files whose orphaned DVs we then have to hunt down and clean up.


##########
core/src/main/java/org/apache/iceberg/ManifestFilterManager.java:
##########
@@ -468,14 +490,16 @@ private ManifestFile filterManifestWithDeletedFiles(
         .forEach(
             entry -> {
               F file = entry.file();
+              boolean isDanglingDV = isDelete && isDanglingDV((DeleteFile) file);
               boolean markedForDelete =
                   deletePaths.contains(file.location())
                       || deleteFiles.contains(file)
                       || dropPartitions.contains(file.specId(), file.partition())
                       || (isDelete
                           && entry.isLive()
                           && entry.dataSequenceNumber() > 0
-                          && entry.dataSequenceNumber() < minSequenceNumber);
+                          && entry.dataSequenceNumber() < minSequenceNumber)
+                      || isDanglingDV;

Review Comment:
   Nit: could we move the `isDanglingDV` check first, since that's a cheaper check to short-circuit on?
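   For illustration, the reordering that nit suggests might look like the sketch below. It uses the same identifiers as the hunk above; whether `isDanglingDV` really is the cheapest branch depends on its implementation, which this diff doesn't show:

   ```java
   boolean isDanglingDV = isDelete && isDanglingDV((DeleteFile) file);
   boolean markedForDelete =
       // cheapest check first: when it is true, the string/set/map
       // lookups below are skipped entirely by short-circuit evaluation
       isDanglingDV
           || deletePaths.contains(file.location())
           || deleteFiles.contains(file)
           || dropPartitions.contains(file.specId(), file.partition())
           || (isDelete
               && entry.isLive()
               && entry.dataSequenceNumber() > 0
               && entry.dataSequenceNumber() < minSequenceNumber);
   ```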
