Parth-Brahmbhatt commented on a change in pull request #388: Handle rollback in 
snapshot expiration
URL: https://github.com/apache/incubator-iceberg/pull/388#discussion_r317838958
 
 

 ##########
 File path: core/src/main/java/org/apache/iceberg/RemoveSnapshots.java
 ##########
 @@ -120,56 +123,128 @@ public void commit() {
           }
         });
 
-    LOG.info("Committed snapshot changes; cleaning up expired manifests and 
data files.");
+    cleanExpiredSnapshots();
+  }
 
+  private void cleanExpiredSnapshots() {
     // clean up the expired snapshots:
     // 1. Get a list of the snapshots that were removed
     // 2. Delete any data files that were deleted by those snapshots and are 
not in the table
     // 3. Delete any manifests that are no longer used by current snapshots
     // 4. Delete the manifest lists
 
+    TableMetadata current = ops.refresh();
+
+    Set<Long> validIds = Sets.newHashSet();
+    for (Snapshot snapshot : current.snapshots()) {
+      validIds.add(snapshot.snapshotId());
+    }
+
+    Set<Long> expiredIds = Sets.newHashSet();
+    for (Snapshot snapshot : base.snapshots()) {
+      long snapshotId = snapshot.snapshotId();
+      if (!validIds.contains(snapshotId)) {
+        // the snapshot was expired
+        LOG.info("Expired snapshot: {}", snapshot);
+        expiredIds.add(snapshotId);
+      }
+    }
+
+    if (expiredIds.isEmpty()) {
+      // if no snapshots were expired, skip cleanup
+      return;
+    }
+
+    LOG.info("Committed snapshot changes; cleaning up expired manifests and 
data files.");
+
+    cleanExpiredFiles(current.snapshots(), validIds, expiredIds);
+  }
+
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
+  private void cleanExpiredFiles(List<Snapshot> snapshots, Set<Long> validIds, 
Set<Long> expiredIds) {
     // Reads and deletes are done using 
Tasks.foreach(...).suppressFailureWhenFinished to complete
     // as much of the delete work as possible and avoid orphaned data or 
manifest files.
 
-    TableMetadata current = ops.refresh();
-    Set<Long> currentIds = Sets.newHashSet();
-    Set<ManifestFile> currentManifests = Sets.newHashSet();
-    for (Snapshot snapshot : current.snapshots()) {
-      currentIds.add(snapshot.snapshotId());
-      currentManifests.addAll(snapshot.manifests());
+    // this is the set of ancestors of the current table state. when removing 
snapshots, this must
+    // only remove files that were deleted in an ancestor of the current table 
state to avoid
+    // physically deleting files that were logically deleted in a commit that 
was rolled back.
+    Set<Long> ancestorIds = 
Sets.newHashSet(SnapshotUtil.ancestorIds(base.currentSnapshot(), 
base::snapshot));
+
+    Set<String> validManifests = Sets.newHashSet();
+    Set<String> manifestsToScan = Sets.newHashSet();
+    for (Snapshot snapshot : snapshots) {
+      validIds.add(snapshot.snapshotId());
 
 Review comment:
   this seems unnecessary, the passed in validIds should already  have these 
snapshotIds.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to