kgeisz commented on code in PR #6847:
URL: https://github.com/apache/hbase/pull/6847#discussion_r2103641903


##########
hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupCommands.java:
##########
@@ -853,6 +869,188 @@ protected void printUsage() {
     }
   }
 
+  /**
+   * The {@code CleanupCommand} class is responsible for removing Write-Ahead 
Log (WAL) and
+   * bulk-loaded files that are no longer needed for Point-in-Time Recovery 
(PITR).
+   * <p>
+   * The cleanup process follows these steps:
+   * <ol>
+   * <li>Identify the oldest full backup and its start timestamp.</li>
+   * <li>Delete WAL files older than this timestamp, as they are no longer 
usable for PITR with any
+   * backup.</li>
+   * </ol>
+   */
+  public static class CleanupCommand extends Command {
+    CleanupCommand(Configuration conf, CommandLine cmdline) {
+      super(conf);
+      this.cmdline = cmdline;
+    }
+
+    @Override
+    public void execute() throws IOException {
+      super.execute();
+
+      // Validate input arguments
+      validateArguments();
+
+      Configuration conf = getConf() != null ? getConf() : 
HBaseConfiguration.create();
+      String backupWalDir = conf.get(CONF_CONTINUOUS_BACKUP_WAL_DIR);
+
+      if (backupWalDir == null || backupWalDir.isEmpty()) {
+        System.out
+          .println("WAL Directory is not specified for continuous backup. 
Nothing to clean!");
+        return;
+      }
+
+      try (final Connection conn = ConnectionFactory.createConnection(conf);
+        final BackupSystemTable sysTable = new BackupSystemTable(conn)) {
+
+        // Retrieve tables that are part of continuous backup
+        Map<TableName, Long> continuousBackupTables = 
sysTable.getContinuousBackupTableSet();
+        if (continuousBackupTables.isEmpty()) {
+          System.out.println("Continuous Backup is not enabled for any tables. 
Nothing to clean!");
+          return;
+        }
+
+        // Determine the earliest timestamp before which WAL files can be 
deleted
+        long cleanupCutoffTimestamp = determineCleanupCutoffTime(sysTable, 
continuousBackupTables);
+        if (cleanupCutoffTimestamp == 0) {
+          System.err.println("ERROR: No valid full backup found. Cleanup 
aborted.");
+          return;
+        }
+
+        // Update the continuous backup table's start time to match the cutoff 
time *before* actual
+        // cleanup.
+        // This is safe because even if the WAL cleanup fails later, we won't 
be accessing data
+        // older than
+        // the cutoff timestamp, ensuring consistency in what the system 
considers valid for
+        // recovery.
+        //
+        // If we did this the other way around—cleaning up first and updating 
the table afterward—
+        // a failure between these two steps could leave us in an inconsistent 
state where some WALs
+        // are already deleted, but the backup metadata still references them.
+        updateContinuousBackupTablesStartTime(sysTable, 
cleanupCutoffTimestamp);
+
+        // Perform WAL file cleanup
+        cleanupOldWALFiles(conf, backupWalDir, cleanupCutoffTimestamp);
+      }
+    }
+
+    /**
+     * Fetches the continuous backup tables from the system table and updates 
their start timestamps
+     * if the current start time is earlier than the given cutoff timestamp.
+     * @param sysTable               The backup system table from which 
continuous backup tables are
+     *                               retrieved and updated.
+     * @param cleanupCutoffTimestamp The cutoff timestamp before which WAL 
files can be deleted.
+     * @throws IOException If an error occurs while accessing the system table.
+     */
+    private void updateContinuousBackupTablesStartTime(BackupSystemTable 
sysTable,
+      long cleanupCutoffTimestamp) throws IOException {
+      Map<TableName, Long> continuousBackupTables = 
sysTable.getContinuousBackupTableSet();
+
+      // Identify tables that need updating
+      Set<TableName> tablesToUpdate = new HashSet<>();
+      for (Map.Entry<TableName, Long> entry : 
continuousBackupTables.entrySet()) {
+        TableName table = entry.getKey();
+        long startTimestamp = entry.getValue();
+
+        if (startTimestamp < cleanupCutoffTimestamp) {
+          tablesToUpdate.add(table);
+        }
+      }
+
+      // If no tables require updates, exit early
+      if (tablesToUpdate.isEmpty()) {
+        return;
+      }
+
+      // Perform the actual update in the system table
+      sysTable.updateContinuousBackupTableSet(tablesToUpdate, 
cleanupCutoffTimestamp);
+    }
+
+    private void validateArguments() throws IOException {
+      String[] args = cmdline == null ? null : cmdline.getArgs();
+      if (args != null && args.length > 1) {
+        System.err.println("ERROR: wrong number of arguments: " + args.length);
+        printUsage();
+        throw new IOException(INCORRECT_USAGE);
+      }
+    }
+
+    private long determineCleanupCutoffTime(BackupSystemTable sysTable,
+      Map<TableName, Long> backupTables) throws IOException {
+      List<BackupInfo> backupInfos = 
sysTable.getBackupInfos(BackupState.COMPLETE);
+      Collections.reverse(backupInfos); // Process from oldest to latest
+
+      for (BackupInfo backupInfo : backupInfos) {
+        if (BackupType.FULL.equals(backupInfo.getType())) {
+          return backupInfo.getStartTs();
+        }
+      }
+      return 0;
+    }
+
+    /**
+     * Cleans up old WAL and bulk-loaded files based on the determined cutoff 
timestamp.
+     */
+    private void cleanupOldWALFiles(Configuration conf, String backupWalDir, 
long cutoffTime)
+      throws IOException {
+      System.out.println("Starting WAL cleanup in backup directory: " + 
backupWalDir

Review Comment:
   @vinayakphegde, are you using `System.out.println()` in this file so that output 
can be displayed to the user when they run `hbase` command-line commands?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to