smengcl commented on code in PR #4490:
URL: https://github.com/apache/ozone/pull/4490#discussion_r1166982383


##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java:
##########
@@ -115,10 +128,24 @@ public final class OmSnapshotManager implements 
AutoCloseable {
   private static final String SNAP_DIFF_REPORT_TABLE_NAME =
       "snap-diff-report-table";
 
+  /**
+   * Contains all the snap diff job which can be purged either due to max
+   * allowed time is over, FAILED or REJECTED.
+   * |-------------------------------------------|
+   * |  KEY     |  VALUE                         |
+   * |-------------------------------------------|
+   * |  jobId   | noOfTotalEntriesInReportTable  |

Review Comment:
   ```suggestion
      * |  jobId   | numOfTotalEntriesInReportTable |
   ```



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java:
##########
@@ -148,8 +177,12 @@ public OmSnapshotManager(OzoneManager ozoneManager) {
       snapDiffJobCf = getOrCreateColumnFamily(SNAP_DIFF_JOB_TABLE_NAME,
               columnFamilyDescriptors, columnFamilyHandles);

Review Comment:
   ```suggestion
             columnFamilyDescriptors, columnFamilyHandles);
   ```



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/SnapshotDiffCleanupService.java:
##########
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.om.service;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import org.apache.hadoop.hdds.utils.BackgroundService;
+import org.apache.hadoop.hdds.utils.BackgroundTask;
+import org.apache.hadoop.hdds.utils.BackgroundTaskQueue;
+import org.apache.hadoop.hdds.utils.BackgroundTaskResult;
+import org.apache.hadoop.hdds.utils.db.CodecRegistry;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksDB;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksIterator;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteBatch;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteOptions;
+import org.apache.hadoop.ozone.om.OzoneManager;
+import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob;
+import org.rocksdb.ColumnFamilyHandle;
+import org.rocksdb.RocksDBException;
+
+import static org.apache.hadoop.ozone.om.OmSnapshotManager.DELIMITER;
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.FAILED;
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.REJECTED;
+
+/**
+ * Background service to clean-up snapDiff jobs which are stable and
+ * corresponding reports.
+ */
+public class SnapshotDiffCleanupService extends BackgroundService {
+  // Use only a single thread for Snapshot Diff cleanup.
+  // Multiple threads would read from the same table and can send deletion
+  // requests for same snapshot diff job multiple times.
+  private static final int SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE = 1;
+
+  private final AtomicBoolean suspended;
+  private final AtomicLong runCount;
+  private final AtomicLong successRunCount;
+  private final OzoneManager ozoneManager;
+  private final ManagedRocksDB db;
+  private final ColumnFamilyHandle snapDiffJobCfh;
+  private final ColumnFamilyHandle snapDiffPurgedJobCfh;
+  private final ColumnFamilyHandle snapDiffReportCfh;
+  private final CodecRegistry codecRegistry;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum numbers of snapDiff jobs to be purged per clean-up task run.
+   */
+  private final long maxJobToPurgePerTask = 1000L;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum time a snapDiff job and corresponding report will be persisted.
+   */
+  private final long maxAllowedTime = Duration.ofDays(7).toMillis();
+
+  @SuppressWarnings("parameternumber")
+  public SnapshotDiffCleanupService(long interval,
+                                    long serviceTimeout,
+                                    OzoneManager ozoneManager,
+                                    ManagedRocksDB db,
+                                    ColumnFamilyHandle snapDiffJobCfh,
+                                    ColumnFamilyHandle snapDiffPurgedJobCfh,
+                                    ColumnFamilyHandle snapDiffReportCfh,
+                                    CodecRegistry codecRegistry) {
+    super(SnapshotDiffCleanupService.class.getSimpleName(),
+        interval,
+        TimeUnit.MILLISECONDS, SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE,
+        serviceTimeout);
+    this.suspended = new AtomicBoolean(false);
+    this.runCount = new AtomicLong(0);
+    this.successRunCount = new AtomicLong(0);
+    this.ozoneManager = ozoneManager;
+    this.db = db;
+    this.snapDiffJobCfh = snapDiffJobCfh;
+    this.snapDiffPurgedJobCfh = snapDiffPurgedJobCfh;
+    this.snapDiffReportCfh = snapDiffReportCfh;
+    this.codecRegistry = codecRegistry;
+  }
+
+  @VisibleForTesting
+  public void run() {
+    // Remove the job report first and then move snapDiff job from active
+    // job table to purge job table. It is done this way, to not deal with
+    // synchronization of snapDiff request and clean up. If entries are
+    // moved from active job table to purge table first and then corresponding
+    // report is removed, there could be a case when snapDiff request saw a
+    // done job in active job table, but their report was deleted by clean up
+    // service.
+    // In clean report table first and them move jobs to purge table approach,
+    // assumption is that by the next cleanup run, there is no purged snapDiff
+    // job reading from report table.
+    removeOlderJobReport();
+    moveOldSnapDiffJobsToPurgeTable();
+  }
+
+  /**
+   * Moves the snapDiff jobs from snapDiffJobTable to purge table which are
+   * older than the allowed time or have FAILED or REJECTED status.
+   * For jobs older than {@link SnapshotDiffCleanupService#maxAllowedTime},
+   * we don't care if there are in QUEUED/IN_PROGRESS state.
+   * Reason, there could be stale QUEUED/IN_PROGRESS jobs because of OM
+   * crash or leader node becoming follower.
+   * The other assumption here is that no snapDiff job takes equal or more time
+   * than the {@link SnapshotDiffCleanupService#maxAllowedTime} for job's
+   * persistent.
+   */
+  private void moveOldSnapDiffJobsToPurgeTable() {
+    try (ManagedRocksIterator iterator =
+             new ManagedRocksIterator(db.get().newIterator(snapDiffJobCfh));
+         ManagedWriteBatch writeBatch = new ManagedWriteBatch();
+         ManagedWriteOptions writeOptions = new ManagedWriteOptions()) {
+      long currentTimeMillis = System.currentTimeMillis();
+      long purgeJobCount = 0;
+      iterator.get().seekToFirst();
+
+      while (iterator.get().isValid() && purgeJobCount < maxJobToPurgePerTask) {
+        byte[] keyBytes = iterator.get().key();
+        byte[] snapInfoBytes = iterator.get().value();
+        iterator.get().next();
+
+        SnapshotDiffJob snapDiffJob = codecRegistry.asObject(snapInfoBytes,
+            SnapshotDiffJob.class);
+
+        if (currentTimeMillis - snapDiffJob.getCreationTime() > maxAllowedTime
+            || snapDiffJob.getStatus() == FAILED
+            || snapDiffJob.getStatus() == REJECTED) {
+
+          writeBatch.put(snapDiffPurgedJobCfh,
+              codecRegistry.asRawData(snapDiffJob.getJobId()),
+              codecRegistry.asRawData(snapDiffJob.getTotalDiffEntries()));
+          writeBatch.delete(snapDiffJobCfh, keyBytes);
+          purgeJobCount++;
+        }
+      }
+
+      db.get().write(writeOptions, writeBatch);
+    } catch (IOException | RocksDBException e) {
+      throw new RuntimeException(e);
+    }

Review Comment:
   Add TODO to properly handle the exception later?



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffJob.java:
##########
@@ -113,6 +121,61 @@ public void setForceFullDiff(boolean forceFullDiff) {
     this.forceFullDiff = forceFullDiff;
   }
 
+  public long getCreationTime() {
+    return creationTime;
+  }
+
+  public void setCreationTime(long creationTime) {
+    this.creationTime = creationTime;
+  }
+
+  public long getTotalDiffEntries() {
+    return totalDiffEntries;
+  }
+
+  public void setTotalDiffEntries(long totalDiffEntries) {
+    this.totalDiffEntries = totalDiffEntries;
+  }
+
+  @Override
+  public String toString() {
+    return "creationTime : " + creationTime +
+        ", jobId: " + jobId +
+        ", status: " + status +
+        ", volume: " + volume +
+        ", bucket: " + bucket +
+        ", fromSnapshot: " + fromSnapshot +
+        ", toSnapshot: " + toSnapshot +
+        ", forceFullDiff: " + forceFullDiff +
+        ", totalDiffEntries: " + totalDiffEntries;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (this == other) {
+      return true;
+    }
+
+    if (other instanceof  SnapshotDiffJob) {

Review Comment:
   ```suggestion
       if (other instanceof SnapshotDiffJob) {
   ```



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java:
##########
@@ -167,8 +158,19 @@ public SnapshotDiffManager(ManagedRocksDB db,
         new ThreadPoolExecutor.CallerRunsPolicy()
     );
 
-    // TODO: [SNAPSHOT] Load jobs only if it is leader node.
-    //  It could a event-triggered form OM when node is leader and up.
+    // Ideally, loadJobsOnStartUp should run only on OM node, since SnapDiff
+    // is not HA currently and running this on all the nodes would be
+    // inefficient. Especially, when OM node restarts and loses its leadership.
+    // However, it is hard to determine if node is leader node because consensus
+    // happens inside Ratis. We can add something like Awaitility.wait() here
+    // but that is not full proof either.

Review Comment:
   I'm not sure I fully get this argument, but you can check if the current OM is the leader or not with `OzoneManager#isLeaderReady`:
   
   
https://github.com/apache/ozone/blob/dd003040a41def491e8de003ef8539ce40854972/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java#L3947-L3957



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/SnapshotDiffCleanupService.java:
##########
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.om.service;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import org.apache.hadoop.hdds.utils.BackgroundService;
+import org.apache.hadoop.hdds.utils.BackgroundTask;
+import org.apache.hadoop.hdds.utils.BackgroundTaskQueue;
+import org.apache.hadoop.hdds.utils.BackgroundTaskResult;
+import org.apache.hadoop.hdds.utils.db.CodecRegistry;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksDB;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksIterator;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteBatch;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteOptions;
+import org.apache.hadoop.ozone.om.OzoneManager;
+import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob;
+import org.rocksdb.ColumnFamilyHandle;
+import org.rocksdb.RocksDBException;
+
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.FAILED;
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.REJECTED;
+
+/**
+ * Background service to clean-up snapDiff jobs which are stable and
+ * corresponding reports.
+ */
+public class SnapshotDiffCleanupService extends BackgroundService {
+  // Use only a single thread for Snapshot Diff cleanup.
+  // Multiple threads would read from the same table and can send deletion
+  // requests for same snapshot diff job multiple times.
+  private static final int SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE = 1;
+
+  private final AtomicBoolean suspended;
+  private final AtomicLong runCount;
+  private final AtomicLong successRunCount;
+  private final OzoneManager ozoneManager;
+  private final ManagedRocksDB db;
+  private final ColumnFamilyHandle snapDiffJobCfh;
+  private final ColumnFamilyHandle snapDiffPurgedJobCfh;
+  private final ColumnFamilyHandle snapDiffReportCfh;
+  private final CodecRegistry codecRegistry;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum numbers of snapDiff jobs to be purged per clean-up task run.
+   */
+  private final long maxJobToPurgePerTask = 1000L;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum time a snapDiff job and corresponding report will be persisted.
+   */
+  private final long maxAllowedTime = Duration.ofDays(7).toMillis();
+
+  @SuppressWarnings("parameternumber")
+  public SnapshotDiffCleanupService(long interval,
+                                    long serviceTimeout,
+                                    OzoneManager ozoneManager,
+                                    ManagedRocksDB db,
+                                    ColumnFamilyHandle snapDiffJobCfh,
+                                    ColumnFamilyHandle snapDiffPurgedJobCfh,
+                                    ColumnFamilyHandle snapDiffReportCfh,
+                                    CodecRegistry codecRegistry) {
+    super(SnapshotDiffCleanupService.class.getSimpleName(),
+        interval,
+        TimeUnit.MILLISECONDS, SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE,
+        serviceTimeout);
+    this.suspended = new AtomicBoolean(false);
+    this.runCount = new AtomicLong(0);
+    this.successRunCount = new AtomicLong(0);
+    this.ozoneManager = ozoneManager;
+    this.db = db;
+    this.snapDiffJobCfh = snapDiffJobCfh;
+    this.snapDiffPurgedJobCfh = snapDiffPurgedJobCfh;
+    this.snapDiffReportCfh = snapDiffReportCfh;
+    this.codecRegistry = codecRegistry;
+  }
+
+  @VisibleForTesting
+  public void run() {
+    // Remove the job report first and then move snapDiff job from active
+    // job table to purge job table. It is done this way, to not deal with
+    // synchronization of snapDiff request and clean up. If entries are
+    // moved from active job table to purge table first and then corresponding
+    // report is removed, there could be a case when snapDiff request saw a
+    // done job in active job table, but their report was deleted by clean up
+    // service.
+    // In clean report table first and them move jobs to purge table approach,
+    // assumption is that by the next cleanup run, there is no purged snapDiff
+    // job reading from report table.
+    removeOlderJobReport();
+    moveOldSnapDiffJobsToPurgeTable();
+  }
+
+  /**
+   * Moves the snapDiff jobs from snapDiffJobTable to purge table which are
+   * older than the allowed time or have FAILED or REJECTED status.
+   * For jobs older than {@link SnapshotDiffCleanupService#maxAllowedTime},
+   * we don't care if there are in QUEUED/IN_PROGRESS state.
+   * Reason, there could be stale QUEUED/IN_PROGRESS jobs because of OM
+   * crash or leader node becoming follower.
+   * The other assumption here is that no snapDiff job takes equal or more time
+   * than the {@link SnapshotDiffCleanupService#maxAllowedTime} for job's
+   * persistent.
+   */
+  private void moveOldSnapDiffJobsToPurgeTable() {

Review Comment:
   ```suggestion
     /**
      * Move the snapDiff jobs from snapDiffJobTable to purge table which are
      * older than the allowed time or have FAILED or REJECTED status.
      * For jobs older than {@link SnapshotDiffCleanupService#maxAllowedTime},
      * we don't care if they are in QUEUED/IN_PROGRESS state.
      * Reason: there could be stale QUEUED/IN_PROGRESS jobs because of OM
      * crashing or leader node becoming follower.
     * The other assumption here is that no snapDiff job takes equal or more time
      * than the {@link SnapshotDiffCleanupService#maxAllowedTime} for job's
      * persistence.
      */
     private void moveOldSnapDiffJobsToPurgeTable() {
   ```



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/SnapshotDiffCleanupService.java:
##########
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.om.service;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import org.apache.hadoop.hdds.utils.BackgroundService;
+import org.apache.hadoop.hdds.utils.BackgroundTask;
+import org.apache.hadoop.hdds.utils.BackgroundTaskQueue;
+import org.apache.hadoop.hdds.utils.BackgroundTaskResult;
+import org.apache.hadoop.hdds.utils.db.CodecRegistry;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksDB;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksIterator;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteBatch;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteOptions;
+import org.apache.hadoop.ozone.om.OzoneManager;
+import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob;
+import org.rocksdb.ColumnFamilyHandle;
+import org.rocksdb.RocksDBException;
+
+import static org.apache.hadoop.ozone.om.OmSnapshotManager.DELIMITER;
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.FAILED;
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.REJECTED;
+
+/**
+ * Background service to clean-up snapDiff jobs which are stable and
+ * corresponding reports.
+ */
+public class SnapshotDiffCleanupService extends BackgroundService {
+  // Use only a single thread for Snapshot Diff cleanup.
+  // Multiple threads would read from the same table and can send deletion
+  // requests for same snapshot diff job multiple times.
+  private static final int SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE = 1;
+
+  private final AtomicBoolean suspended;
+  private final AtomicLong runCount;
+  private final AtomicLong successRunCount;
+  private final OzoneManager ozoneManager;
+  private final ManagedRocksDB db;
+  private final ColumnFamilyHandle snapDiffJobCfh;
+  private final ColumnFamilyHandle snapDiffPurgedJobCfh;
+  private final ColumnFamilyHandle snapDiffReportCfh;
+  private final CodecRegistry codecRegistry;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum numbers of snapDiff jobs to be purged per clean-up task run.
+   */
+  private final long maxJobToPurgePerTask = 1000L;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum time a snapDiff job and corresponding report will be persisted.
+   */
+  private final long maxAllowedTime = Duration.ofDays(7).toMillis();
+
+  @SuppressWarnings("parameternumber")
+  public SnapshotDiffCleanupService(long interval,
+                                    long serviceTimeout,
+                                    OzoneManager ozoneManager,
+                                    ManagedRocksDB db,
+                                    ColumnFamilyHandle snapDiffJobCfh,
+                                    ColumnFamilyHandle snapDiffPurgedJobCfh,
+                                    ColumnFamilyHandle snapDiffReportCfh,
+                                    CodecRegistry codecRegistry) {
+    super(SnapshotDiffCleanupService.class.getSimpleName(),
+        interval,
+        TimeUnit.MILLISECONDS, SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE,
+        serviceTimeout);
+    this.suspended = new AtomicBoolean(false);
+    this.runCount = new AtomicLong(0);
+    this.successRunCount = new AtomicLong(0);
+    this.ozoneManager = ozoneManager;
+    this.db = db;
+    this.snapDiffJobCfh = snapDiffJobCfh;
+    this.snapDiffPurgedJobCfh = snapDiffPurgedJobCfh;
+    this.snapDiffReportCfh = snapDiffReportCfh;
+    this.codecRegistry = codecRegistry;
+  }
+
+  @VisibleForTesting
+  public void run() {
+    // Remove the job report first and then move snapDiff job from active
+    // job table to purge job table. It is done this way, to not deal with
+    // synchronization of snapDiff request and clean up. If entries are
+    // moved from active job table to purge table first and then corresponding
+    // report is removed, there could be a case when snapDiff request saw a
+    // done job in active job table, but their report was deleted by clean up
+    // service.
+    // In clean report table first and them move jobs to purge table approach,
+    // assumption is that by the next cleanup run, there is no purged snapDiff
+    // job reading from report table.
+    removeOlderJobReport();
+    moveOldSnapDiffJobsToPurgeTable();
+  }
+
+  /**
+   * Moves the snapDiff jobs from snapDiffJobTable to purge table which are
+   * older than the allowed time or have FAILED or REJECTED status.
+   * For jobs older than {@link SnapshotDiffCleanupService#maxAllowedTime},
+   * we don't care if there are in QUEUED/IN_PROGRESS state.
+   * Reason, there could be stale QUEUED/IN_PROGRESS jobs because of OM
+   * crash or leader node becoming follower.
+   * The other assumption here is that no snapDiff job takes equal or more time
+   * than the {@link SnapshotDiffCleanupService#maxAllowedTime} for job's
+   * persistent.
+   */
+  private void moveOldSnapDiffJobsToPurgeTable() {

Review Comment:
   I don't quite get the last part `for job's persistence`



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java:
##########
@@ -167,8 +158,19 @@ public SnapshotDiffManager(ManagedRocksDB db,
         new ThreadPoolExecutor.CallerRunsPolicy()
     );
 
-    // TODO: [SNAPSHOT] Load jobs only if it is leader node.
-    //  It could a event-triggered form OM when node is leader and up.
+    // Ideally, loadJobsOnStartUp should run only on OM node, since SnapDiff
+    // is not HA currently and running this on all the nodes would be
+    // inefficient. Especially, when OM node restarts and loses its leadership.
+    // However, it is hard to determine if node is leader node because consensus
+    // happens inside Ratis. We can add something like Awaitility.wait() here
+    // but that is not full proof either.

Review Comment:
   ```suggestion
       // However, it is hard to determine if node is leader node because consensus
       // happens inside Ratis. We can add something like Awaitility.wait() here
       // but that is not fool proof either.
   ```



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/SnapshotDiffCleanupService.java:
##########
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.om.service;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import org.apache.hadoop.hdds.utils.BackgroundService;
+import org.apache.hadoop.hdds.utils.BackgroundTask;
+import org.apache.hadoop.hdds.utils.BackgroundTaskQueue;
+import org.apache.hadoop.hdds.utils.BackgroundTaskResult;
+import org.apache.hadoop.hdds.utils.db.CodecRegistry;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksDB;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksIterator;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteBatch;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteOptions;
+import org.apache.hadoop.ozone.om.OzoneManager;
+import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob;
+import org.rocksdb.ColumnFamilyHandle;
+import org.rocksdb.RocksDBException;
+
+import static org.apache.hadoop.ozone.om.OmSnapshotManager.DELIMITER;
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.FAILED;
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.REJECTED;
+
+/**
+ * Background service to clean-up snapDiff jobs which are stable and
+ * corresponding reports.
+ */
+public class SnapshotDiffCleanupService extends BackgroundService {
+  // Use only a single thread for Snapshot Diff cleanup.
+  // Multiple threads would read from the same table and can send deletion
+  // requests for same snapshot diff job multiple times.
+  private static final int SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE = 1;
+
+  private final AtomicBoolean suspended;
+  private final AtomicLong runCount;
+  private final AtomicLong successRunCount;
+  private final OzoneManager ozoneManager;
+  private final ManagedRocksDB db;
+  private final ColumnFamilyHandle snapDiffJobCfh;
+  private final ColumnFamilyHandle snapDiffPurgedJobCfh;
+  private final ColumnFamilyHandle snapDiffReportCfh;
+  private final CodecRegistry codecRegistry;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum numbers of snapDiff jobs to be purged per clean-up task run.
+   */
+  private final long maxJobToPurgePerTask = 1000L;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum time a snapDiff job and corresponding report will be persisted.
+   */
+  private final long maxAllowedTime = Duration.ofDays(7).toMillis();
+
+  @SuppressWarnings("parameternumber")
+  public SnapshotDiffCleanupService(long interval,
+                                    long serviceTimeout,
+                                    OzoneManager ozoneManager,
+                                    ManagedRocksDB db,
+                                    ColumnFamilyHandle snapDiffJobCfh,
+                                    ColumnFamilyHandle snapDiffPurgedJobCfh,
+                                    ColumnFamilyHandle snapDiffReportCfh,
+                                    CodecRegistry codecRegistry) {
+    super(SnapshotDiffCleanupService.class.getSimpleName(),
+        interval,
+        TimeUnit.MILLISECONDS, SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE,
+        serviceTimeout);
+    this.suspended = new AtomicBoolean(false);
+    this.runCount = new AtomicLong(0);
+    this.successRunCount = new AtomicLong(0);
+    this.ozoneManager = ozoneManager;
+    this.db = db;
+    this.snapDiffJobCfh = snapDiffJobCfh;
+    this.snapDiffPurgedJobCfh = snapDiffPurgedJobCfh;
+    this.snapDiffReportCfh = snapDiffReportCfh;
+    this.codecRegistry = codecRegistry;
+  }
+
+  @VisibleForTesting
+  public void run() {
+    // Remove the job report first and then move snapDiff job from active
+    // job table to purge job table. It is done this way, to not deal with
+    // synchronization of snapDiff request and clean up. If entries are
+    // moved from active job table to purge table first and then corresponding
+    // report is removed, there could be a case when snapDiff request saw a
+    // done job in active job table, but their report was deleted by clean up
+    // service.
+    // In clean report table first and them move jobs to purge table approach,
+    // assumption is that by the next cleanup run, there is no purged snapDiff
+    // job reading from report table.
+    removeOlderJobReport();
+    moveOldSnapDiffJobsToPurgeTable();
+  }
+
+  /**
+   * Moves the snapDiff jobs from snapDiffJobTable to purge table which are
+   * older than the allowed time or have FAILED or REJECTED status.
+   * For jobs older than {@link SnapshotDiffCleanupService#maxAllowedTime},
+   * we don't care if there are in QUEUED/IN_PROGRESS state.
+   * Reason, there could be stale QUEUED/IN_PROGRESS jobs because of OM
+   * crash or leader node becoming follower.
+   * The other assumption here is that no snapDiff job takes equal or more time
+   * than the {@link SnapshotDiffCleanupService#maxAllowedTime} for job's
+   * persistent.
+   */
+  private void moveOldSnapDiffJobsToPurgeTable() {
+    try (ManagedRocksIterator iterator =
+             new ManagedRocksIterator(db.get().newIterator(snapDiffJobCfh));
+         ManagedWriteBatch writeBatch = new ManagedWriteBatch();
+         ManagedWriteOptions writeOptions = new ManagedWriteOptions()) {
+      long currentTimeMillis = System.currentTimeMillis();
+      long purgeJobCount = 0;
+      iterator.get().seekToFirst();
+
+      while (iterator.get().isValid() && purgeJobCount < maxJobToPurgePerTask) {
+        byte[] keyBytes = iterator.get().key();
+        byte[] snapInfoBytes = iterator.get().value();
+        iterator.get().next();
+
+        SnapshotDiffJob snapDiffJob = codecRegistry.asObject(snapInfoBytes,
+            SnapshotDiffJob.class);
+
+        if (currentTimeMillis - snapDiffJob.getCreationTime() > maxAllowedTime
+            || snapDiffJob.getStatus() == FAILED
+            || snapDiffJob.getStatus() == REJECTED) {
+
+          writeBatch.put(snapDiffPurgedJobCfh,
+              codecRegistry.asRawData(snapDiffJob.getJobId()),
+              codecRegistry.asRawData(snapDiffJob.getTotalDiffEntries()));
+          writeBatch.delete(snapDiffJobCfh, keyBytes);
+          purgeJobCount++;
+        }
+      }
+
+      db.get().write(writeOptions, writeBatch);
+    } catch (IOException | RocksDBException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  private void removeOlderJobReport() {
+    try (ManagedRocksIterator rocksIterator = new ManagedRocksIterator(
+        db.get().newIterator(snapDiffPurgedJobCfh));
+         ManagedWriteBatch writeBatch = new ManagedWriteBatch();
+         ManagedWriteOptions writeOptions = new ManagedWriteOptions()) {
+      rocksIterator.get().seekToFirst();
+      while (rocksIterator.get().isValid()) {
+        byte[] key = rocksIterator.get().key();
+        byte[] value = rocksIterator.get().value();
+        rocksIterator.get().next();
+        String prefix = codecRegistry.asObject(key, String.class);
+        long totalNumberOfEntries = codecRegistry.asObject(value, Long.class);
+
+        if (totalNumberOfEntries > 0) {
+          byte[] beginKey = codecRegistry.asRawData(prefix + DELIMITER + 0);
+          byte[] endKey = codecRegistry.asRawData(prefix + DELIMITER +
+              (totalNumberOfEntries - 1));
+          // Delete Range excludes the endKey.
+          // Hence, we do two delete,
+          //  1. deleteRange form beginKey(included) to endKey(excluded).
+          //  2. delete endKey.
+          writeBatch.deleteRange(snapDiffReportCfh, beginKey, endKey);
+          writeBatch.delete(snapDiffReportCfh, endKey);
+        }
+        // Finally, remove the entry from the purged job table.
+        writeBatch.delete(snapDiffPurgedJobCfh, key);
+      }
+      db.get().write(writeOptions, writeBatch);
+    } catch (IOException | RocksDBException e) {
+      throw new RuntimeException(e);
+    }

Review Comment:
   Same here



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffJob.java:
##########
@@ -113,6 +121,61 @@ public void setForceFullDiff(boolean forceFullDiff) {
     this.forceFullDiff = forceFullDiff;
   }
 
+  /** Returns the job's creation time in epoch milliseconds. */
+  public long getCreationTime() {
+    return creationTime;
+  }
+
+  /**
+   * Sets the job's creation time (epoch millis; compared against
+   * {@code System.currentTimeMillis()} by the cleanup service).
+   */
+  public void setCreationTime(long creationTime) {
+    this.creationTime = creationTime;
+  }
+
+  /**
+   * Returns the total number of diff entries this job wrote to the snap-diff
+   * report table; the cleanup service uses it to bound its deleteRange.
+   */
+  public long getTotalDiffEntries() {
+    return totalDiffEntries;
+  }
+
+  /** Sets the total number of report-table entries produced by this job. */
+  public void setTotalDiffEntries(long totalDiffEntries) {
+    this.totalDiffEntries = totalDiffEntries;
+  }
+
+  @Override
+  public String toString() {
+    return "creationTime : " + creationTime +
+        ", jobId: " + jobId +
+        ", status: " + status +
+        ", volume: " + volume +
+        ", bucket: " + bucket +
+        ", fromSnapshot: " + fromSnapshot +
+        ", toSnapshot: " + toSnapshot +
+        ", forceFullDiff: " + forceFullDiff +
+        ", totalDiffEntries: " + totalDiffEntries;
+  }
+
+  /**
+   * Structural equality over all fields used by {@link #toString()}.
+   * Primitive fields ({@code creationTime}, {@code forceFullDiff},
+   * {@code totalDiffEntries}) are compared with {@code ==} to avoid the
+   * autoboxing that {@link Objects#equals} would incur.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (this == other) {
+      return true;
+    }
+    if (!(other instanceof SnapshotDiffJob)) {
+      return false;
+    }
+    SnapshotDiffJob otherJob = (SnapshotDiffJob) other;
+    return this.creationTime == otherJob.creationTime &&
+        this.forceFullDiff == otherJob.forceFullDiff &&
+        this.totalDiffEntries == otherJob.totalDiffEntries &&
+        Objects.equals(this.jobId, otherJob.jobId) &&
+        Objects.equals(this.status, otherJob.status) &&
+        Objects.equals(this.volume, otherJob.volume) &&
+        Objects.equals(this.bucket, otherJob.bucket) &&
+        Objects.equals(this.fromSnapshot, otherJob.fromSnapshot) &&
+        Objects.equals(this.toSnapshot, otherJob.toSnapshot);
+  }
+
+  /**
+   * Hashes the same fields that {@link #equals(Object)} compares, keeping
+   * the equals/hashCode contract. Hashing the fields directly avoids
+   * building the full {@link #toString()} string on every call.
+   */
+  @Override
+  public int hashCode() {
+    return Objects.hash(creationTime, jobId, status, volume, bucket,
+        fromSnapshot, toSnapshot, forceFullDiff, totalDiffEntries);
+  }

Review Comment:
   Maybe hashing the relevant variables directly is more efficient? e.g. 
https://github.com/apache/ozone/blob/a05543b1b0776396844a82994dc4cbee4765f0ef/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ECReplicationConfig.java#L194-L197



##########
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/SnapshotDiffCleanupService.java:
##########
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.om.service;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import org.apache.hadoop.hdds.utils.BackgroundService;
+import org.apache.hadoop.hdds.utils.BackgroundTask;
+import org.apache.hadoop.hdds.utils.BackgroundTaskQueue;
+import org.apache.hadoop.hdds.utils.BackgroundTaskResult;
+import org.apache.hadoop.hdds.utils.db.CodecRegistry;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksDB;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksIterator;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteBatch;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedWriteOptions;
+import org.apache.hadoop.ozone.om.OzoneManager;
+import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob;
+import org.rocksdb.ColumnFamilyHandle;
+import org.rocksdb.RocksDBException;
+
+import static org.apache.hadoop.ozone.om.OmSnapshotManager.DELIMITER;
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.FAILED;
+import static org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.REJECTED;
+
+/**
+ * Background service to clean-up snapDiff jobs which are stable and
+ * corresponding reports.
+ */
+public class SnapshotDiffCleanupService extends BackgroundService {
+  // Use only a single thread for Snapshot Diff cleanup.
+  // Multiple threads would read from the same table and can send deletion
+  // requests for same snapshot diff job multiple times.
+  private static final int SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE = 1;
+
+  // NOTE(review): suspended/runCount/successRunCount are initialized in the
+  // constructor but their readers are outside this hunk — presumably pause
+  // control and run metrics for the background task; confirm in full file.
+  private final AtomicBoolean suspended;
+  private final AtomicLong runCount;
+  private final AtomicLong successRunCount;
+  private final OzoneManager ozoneManager;
+  // RocksDB instance plus the three column families this service touches:
+  // active jobs, purged jobs, and per-job diff report entries.
+  private final ManagedRocksDB db;
+  private final ColumnFamilyHandle snapDiffJobCfh;
+  private final ColumnFamilyHandle snapDiffPurgedJobCfh;
+  private final ColumnFamilyHandle snapDiffReportCfh;
+  // Serializes/deserializes keys and values of the column families above.
+  private final CodecRegistry codecRegistry;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum numbers of snapDiff jobs to be purged per clean-up task run.
+   */
+  private final long maxJobToPurgePerTask = 1000L;
+
+  // TODO: [SNAPSHOT] Move this to config.
+  /**
+   * Maximum time a snapDiff job and corresponding report will be persisted.
+   */
+  private final long maxAllowedTime = Duration.ofDays(7).toMillis();
+
+  @SuppressWarnings("parameternumber")
+  public SnapshotDiffCleanupService(long interval,
+                                    long serviceTimeout,
+                                    OzoneManager ozoneManager,
+                                    ManagedRocksDB db,
+                                    ColumnFamilyHandle snapDiffJobCfh,
+                                    ColumnFamilyHandle snapDiffPurgedJobCfh,
+                                    ColumnFamilyHandle snapDiffReportCfh,
+                                    CodecRegistry codecRegistry) {
+    super(SnapshotDiffCleanupService.class.getSimpleName(),
+        interval,
+        TimeUnit.MILLISECONDS, SNAPSHOT_DIFF_CLEANUP_CORE_POOL_SIZE,
+        serviceTimeout);
+    this.suspended = new AtomicBoolean(false);
+    this.runCount = new AtomicLong(0);
+    this.successRunCount = new AtomicLong(0);
+    this.ozoneManager = ozoneManager;
+    this.db = db;
+    this.snapDiffJobCfh = snapDiffJobCfh;
+    this.snapDiffPurgedJobCfh = snapDiffPurgedJobCfh;
+    this.snapDiffReportCfh = snapDiffReportCfh;
+    this.codecRegistry = codecRegistry;
+  }
+
+  @VisibleForTesting
+  public void run() {
+    // Remove the job reports first, and only then move snapDiff jobs from
+    // the active job table to the purge table. This ordering avoids having
+    // to synchronize snapDiff requests with clean-up: if jobs were moved to
+    // the purge table before their reports were removed, a snapDiff request
+    // could see a finished job in the active job table while its report had
+    // already been deleted by this service.
+    // With the clean-report-first approach, the assumption is that by the
+    // next clean-up run, no reader of a purged snapDiff job is still reading
+    // from the report table.
+    removeOlderJobReport();
+    moveOldSnapDiffJobsToPurgeTable();
+  }
+
+  /**
+   * Moves the snapDiff jobs from snapDiffJobTable to purge table which are
+   * older than the allowed time or have FAILED or REJECTED status.
+   * For jobs older than {@link SnapshotDiffCleanupService#maxAllowedTime},
+   * we don't care if there are in QUEUED/IN_PROGRESS state.
+   * Reason, there could be stale QUEUED/IN_PROGRESS jobs because of OM
+   * crash or leader node becoming follower.
+   * The other assumption here is that no snapDiff job takes equal or more time
+   * than the {@link SnapshotDiffCleanupService#maxAllowedTime} for job's
+   * persistent.
+   */
+  private void moveOldSnapDiffJobsToPurgeTable() {
+    try (ManagedRocksIterator iterator =
+             new ManagedRocksIterator(db.get().newIterator(snapDiffJobCfh));
+         ManagedWriteBatch writeBatch = new ManagedWriteBatch();
+         ManagedWriteOptions writeOptions = new ManagedWriteOptions()) {
+      long currentTimeMillis = System.currentTimeMillis();
+      long purgeJobCount = 0;
+      iterator.get().seekToFirst();
+
+      while (iterator.get().isValid() && purgeJobCount < maxJobToPurgePerTask) 
{
+        byte[] keyBytes = iterator.get().key();
+        byte[] snapInfoBytes = iterator.get().value();
+        iterator.get().next();
+
+        SnapshotDiffJob snapDiffJob = codecRegistry.asObject(snapInfoBytes,
+            SnapshotDiffJob.class);
+
+        if (currentTimeMillis - snapDiffJob.getCreationTime() > maxAllowedTime
+            || snapDiffJob.getStatus() == FAILED
+            || snapDiffJob.getStatus() == REJECTED) {
+
+          writeBatch.put(snapDiffPurgedJobCfh,
+              codecRegistry.asRawData(snapDiffJob.getJobId()),
+              codecRegistry.asRawData(snapDiffJob.getTotalDiffEntries()));
+          writeBatch.delete(snapDiffJobCfh, keyBytes);
+          purgeJobCount++;
+        }
+      }
+
+      db.get().write(writeOptions, writeBatch);
+    } catch (IOException | RocksDBException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  private void removeOlderJobReport() {
+    try (ManagedRocksIterator rocksIterator = new ManagedRocksIterator(
+        db.get().newIterator(snapDiffPurgedJobCfh));
+         ManagedWriteBatch writeBatch = new ManagedWriteBatch();
+         ManagedWriteOptions writeOptions = new ManagedWriteOptions()) {
+      rocksIterator.get().seekToFirst();
+      while (rocksIterator.get().isValid()) {
+        byte[] key = rocksIterator.get().key();
+        byte[] value = rocksIterator.get().value();
+        rocksIterator.get().next();
+        String prefix = codecRegistry.asObject(key, String.class);
+        long totalNumberOfEntries = codecRegistry.asObject(value, Long.class);
+
+        if (totalNumberOfEntries > 0) {
+          byte[] beginKey = codecRegistry.asRawData(prefix + DELIMITER + 0);
+          byte[] endKey = codecRegistry.asRawData(prefix + DELIMITER +
+              (totalNumberOfEntries - 1));
+          // deleteRange excludes the endKey.
+          // Hence, we do two deletes:
+          //  1. deleteRange from beginKey (included) to endKey (excluded).
+          //  2. delete endKey.
+          writeBatch.deleteRange(snapDiffReportCfh, beginKey, endKey);
+          writeBatch.delete(snapDiffReportCfh, endKey);

Review Comment:
   nice. glad to see `deleteRange` being put to use here.
   
   Could further abstract this into `deletePrefix` later in a refactoring jira.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to