laserninja commented on code in PR #14354:
URL: https://github.com/apache/iceberg/pull/14354#discussion_r2441579309
##########
spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/ExpireSnapshotsSparkAction.java:
##########
@@ -282,4 +309,113 @@ private ExpireSnapshots.Result deleteFiles(Iterator<FileInfo> files) {
.deletedStatisticsFilesCount(summary.statisticsFilesCount())
.build();
}
+
+ /**
+   * Logs which snapshots will be expired and the reasons for expiration. This helps debug issues
+ * where more files than expected were deleted.
+ */
+ private void logSnapshotExpirationReasons(TableMetadata metadata) {
+ LOG.info("=== Snapshot Expiration Analysis ===");
+
+ // Determine snapshots that will be kept for retain_last calculation
+ Set<Long> snapshotsToKeep = Sets.newHashSet();
+ if (retainLastValue != null) {
+ List<Snapshot> sortedSnapshots =
+ metadata.snapshots().stream()
+              .sorted((s1, s2) -> Long.compare(s2.timestampMillis(), s1.timestampMillis()))
+ .collect(Collectors.toList());
+
+ sortedSnapshots.stream()
+ .limit(retainLastValue)
+ .map(Snapshot::snapshotId)
+ .forEach(snapshotsToKeep::add);
+
+      LOG.info("Snapshots to keep (retain_last={}): {}", retainLastValue, snapshotsToKeep);
+ }
+
+ // Analyze each snapshot and log expiration reason
+ for (Snapshot snapshot : metadata.snapshots()) {
+ long snapshotId = snapshot.snapshotId();
+ boolean willExpire = false;
+ String reason = "";
+
+ // Check explicit snapshot IDs
+ if (expiredSnapshotIds.contains(snapshotId)) {
+ willExpire = true;
+ reason = "explicitly marked for expiration";
+ }
+ // Check time-based criteria
+      else if (expireOlderThanValue != null && snapshot.timestampMillis() < expireOlderThanValue) {
+ willExpire = true;
+        reason = "older than threshold (" + new java.util.Date(expireOlderThanValue) + ")";
+ }
+ // Check count-based criteria
+      else if (retainLastValue != null && !snapshotsToKeep.contains(snapshotId)) {
+ willExpire = true;
+ reason = "beyond retain_last limit (" + retainLastValue + ")";
+ }
+
+ if (willExpire) {
+ LOG.info(
+ "WILL EXPIRE snapshot {} created at {} - reason: {}",
+ snapshotId,
+ new java.util.Date(snapshot.timestampMillis()),
+ reason);
+ }
+ }
+ }
+
+ /**
+   * Logs detailed information about files that will be deleted, organized by snapshot. This
+ * provides the file-to-snapshot mapping requested for debugging.
+ */
+ private void logSnapshotDeletionSummary(
+ TableMetadata originalMetadata, Set<Long> deletedSnapshotIds) {
+ LOG.info("=== File Deletion Analysis by Snapshot ===");
+
+ for (Long snapshotId : deletedSnapshotIds) {
+ Set<Long> singleSnapshotSet = Sets.newHashSet(snapshotId);
+      Dataset<FileInfo> snapshotFiles = fileDS(originalMetadata, singleSnapshotSet);
+
+ // Use memory-efficient streaming to avoid OOM with large snapshots
+ long totalFiles = snapshotFiles.count();
Review Comment:
Calling .count() on each snapshot might trigger a full dataset scan, right? That can get expensive when a table has many snapshots to expire. Could we use an approximate count, or add a timeout/limit mechanism instead?
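
For example, a rough, untested sketch of the kind of thing I mean (the threshold, timeout, and confidence values below are made up for illustration, not existing options in this action):

```java
// Option 1: cap the scan with limit(); count() can then stop early once we
// know the snapshot exceeds the threshold (maxFilesToCount is illustrative).
int maxFilesToCount = 100_000;
long cappedCount = snapshotFiles.limit(maxFilesToCount + 1).count();
if (cappedCount > maxFilesToCount) {
  LOG.info("Snapshot {} has more than {} files to delete", snapshotId, maxFilesToCount);
} else {
  LOG.info("Snapshot {} has {} files to delete", snapshotId, cappedCount);
}

// Option 2: Spark's RDD API offers a time-bounded approximate count
// (imports: org.apache.spark.partial.PartialResult, org.apache.spark.partial.BoundedDouble).
PartialResult<BoundedDouble> approx =
    snapshotFiles.javaRDD().countApprox(10_000L /* timeout ms */, 0.95 /* confidence */);
LOG.info("Snapshot {} has ~{} files to delete", snapshotId, (long) approx.initialValue().mean());
```

Either variant would keep the per-snapshot logging cost bounded even for snapshots with very large file counts.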