Parth-Brahmbhatt commented on a change in pull request #388: Handle rollback in snapshot expiration URL: https://github.com/apache/incubator-iceberg/pull/388#discussion_r317838958
########## File path: core/src/main/java/org/apache/iceberg/RemoveSnapshots.java ########## @@ -120,56 +123,128 @@ public void commit() { } }); - LOG.info("Committed snapshot changes; cleaning up expired manifests and data files."); + cleanExpiredSnapshots(); + } + private void cleanExpiredSnapshots() { // clean up the expired snapshots: // 1. Get a list of the snapshots that were removed // 2. Delete any data files that were deleted by those snapshots and are not in the table // 3. Delete any manifests that are no longer used by current snapshots // 4. Delete the manifest lists + TableMetadata current = ops.refresh(); + + Set<Long> validIds = Sets.newHashSet(); + for (Snapshot snapshot : current.snapshots()) { + validIds.add(snapshot.snapshotId()); + } + + Set<Long> expiredIds = Sets.newHashSet(); + for (Snapshot snapshot : base.snapshots()) { + long snapshotId = snapshot.snapshotId(); + if (!validIds.contains(snapshotId)) { + // the snapshot was expired + LOG.info("Expired snapshot: {}", snapshot); + expiredIds.add(snapshotId); + } + } + + if (expiredIds.isEmpty()) { + // if no snapshots were expired, skip cleanup + return; + } + + LOG.info("Committed snapshot changes; cleaning up expired manifests and data files."); + + cleanExpiredFiles(current.snapshots(), validIds, expiredIds); + } + + @SuppressWarnings("checkstyle:CyclomaticComplexity") + private void cleanExpiredFiles(List<Snapshot> snapshots, Set<Long> validIds, Set<Long> expiredIds) { // Reads and deletes are done using Tasks.foreach(...).suppressFailureWhenFinished to complete // as much of the delete work as possible and avoid orphaned data or manifest files. - TableMetadata current = ops.refresh(); - Set<Long> currentIds = Sets.newHashSet(); - Set<ManifestFile> currentManifests = Sets.newHashSet(); - for (Snapshot snapshot : current.snapshots()) { - currentIds.add(snapshot.snapshotId()); - currentManifests.addAll(snapshot.manifests()); + // this is the set of ancestors of the current table state. when removing snapshots, this must + // only remove files that were deleted in an ancestor of the current table state to avoid + // physically deleting files that were logically deleted in a commit that was rolled back. + Set<Long> ancestorIds = Sets.newHashSet(SnapshotUtil.ancestorIds(base.currentSnapshot(), base::snapshot)); + + Set<String> validManifests = Sets.newHashSet(); + Set<String> manifestsToScan = Sets.newHashSet(); + for (Snapshot snapshot : snapshots) { + validIds.add(snapshot.snapshotId()); Review comment: this seems unnecessary, the passed in validIds should already have these snapshotIds. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org