Copilot commented on code in PR #9981:
URL: https://github.com/apache/ozone/pull/9981#discussion_r3076620718


##########
hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/ozone/admin/scm/GetDeletedBlockSummarySubcommand.java:
##########
@@ -18,37 +18,196 @@
 package org.apache.hadoop.ozone.admin.scm;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Scanner;
 import org.apache.hadoop.hdds.cli.HddsVersionProvider;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionSummary;
+import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos
+    .RepairDeletedBlocksTxnSummaryFromCheckpointResponseProto;
 import org.apache.hadoop.hdds.scm.cli.ScmSubcommand;
 import org.apache.hadoop.hdds.scm.client.ScmClient;
 import picocli.CommandLine;
 
 /**
  * Handler of getting deleted blocks summary from SCM side.
+ *
+ * <ul>
+ *   <li><b>(no flag)</b> — print in-memory summary.</li>
+ *   <li><b>--detail</b> — take a RocksDB checkpoint and show Checkpoint-Actual
+ *       vs Checkpoint-Persisted for each dimension.</li>
+ *   <li><b>--repair</b> — same as --detail, then compute the diff, ask the
+ *       operator to confirm, and apply it to the live in-memory counters.</li>
+ * </ul>
  */
 @CommandLine.Command(
     name = "summary",
-    description = "get DeletedBlocksTransaction summary",
+    description = "Get DeletedBlocksTransaction summary",
     mixinStandardHelpOptions = true,
     versionProvider = HddsVersionProvider.class)
 public class GetDeletedBlockSummarySubcommand extends ScmSubcommand {
 
+  @CommandLine.Option(names = {"--detail"},
+      description = "Take a RocksDB checkpoint and compare the actual row "
+          + "counts in the checkpoint with the persisted summary stored in the 
"
+          + "same checkpoint.")
+  private boolean detail = false;
+
+  @CommandLine.Option(names = {"--repair"},
+      description = "Same as --detail: shows the diff between 
checkpoint-actual "
+          + "and checkpoint-persisted, then prompts the operator before 
applying "
+          + "the diff to the live in-memory counters.")
+  private boolean repair = false;
+
   @Override
   public void execute(ScmClient client) throws IOException {
-    HddsProtos.DeletedBlocksTransactionSummary summary = 
client.getDeletedBlockSummary();
-    if (summary == null) {
-      System.out.println("DeletedBlocksTransaction summary is not available");
+    if (repair) {
+      executeRepair(client);
+    } else if (detail) {
+      executeDetail(client);
     } else {
-      System.out.println("DeletedBlocksTransaction summary:");
-      System.out.println("  Total number of transactions: " +
-          summary.getTotalTransactionCount());
-      System.out.println("  Total number of blocks: " +
-          summary.getTotalBlockCount());
-      System.out.println("  Total size of blocks: " +
-          summary.getTotalBlockSize());
-      System.out.println("  Total replicated size of blocks: " +
-          summary.getTotalBlockReplicatedSize());
+      executeDefault(client);
+    }
+  }
+
+  private void executeDefault(ScmClient client) throws IOException {
+    DeletedBlocksTransactionSummary summary = client.getDeletedBlockSummary();
+    if (summary == null) {
+      System.out.println("DeletedBlocksTransaction summary is not available.");
+      return;
+    }
+    System.out.println("DeletedBlocksTransaction summary:");
+    System.out.println("Total number of transactions: " + 
summary.getTotalTransactionCount());
+    System.out.println("Total number of blocks: " + 
summary.getTotalBlockCount());
+    System.out.println("Total size of blocks: " + summary.getTotalBlockSize());
+    System.out.println("Total replicated size of blocks: " + 
summary.getTotalBlockReplicatedSize());
+  }
+
+  private void executeDetail(ScmClient client) throws IOException {
+    System.out.println("Taking RocksDB checkpoint and scanning deletedBlocks 
table ...");
+    RepairDeletedBlocksTxnSummaryFromCheckpointResponseProto resp =
+        client.getDeletedBlockSummaryFromCheckpoint();
+    printComparisonTable(resp);
+  }
+
+  private void executeRepair(ScmClient client) throws IOException {
+    System.out.println("Taking RocksDB checkpoint and scanning deletedBlocks 
table ...");
+    RepairDeletedBlocksTxnSummaryFromCheckpointResponseProto resp =
+        client.getDeletedBlockSummaryFromCheckpoint();
+    printComparisonTable(resp);
+
+    DeletedBlocksTransactionSummary actual =
+        resp.hasCheckpointActual() ? resp.getCheckpointActual() : null;
+    DeletedBlocksTransactionSummary persisted =
+        resp.hasCheckpointPersisted() ? resp.getCheckpointPersisted() : null;
+
+    if (!hasDiff(actual, persisted)) {
+      System.out.println("\nCheckpoint is consistent and nothing to repair.");
+      return;
+    }
+
+    System.out.println("\nDiff (Checkpoint-Actual minus 
Checkpoint-Persisted):");
+    printDiff(actual, persisted);
+
+    System.out.print("\nApply this diff to live in-memory counters? [y/N]: ");
+    System.out.flush();
+    String answer = new Scanner(System.in, 
StandardCharsets.UTF_8.name()).nextLine().trim();
+    if (!answer.equalsIgnoreCase("y") && !answer.equalsIgnoreCase("yes")) {
+      System.out.println("Aborted. No changes applied.");
+      return;
+    }
+
+    System.out.println("Applying diff ...");
+    RepairDeletedBlocksTxnSummaryFromCheckpointResponseProto repaired =
+        client.repairDeletedBlockSummaryFromCheckpoint();
+

Review Comment:
   In `--repair` mode the code prints a diff from 
`getDeletedBlockSummaryFromCheckpoint()`, but then calls 
`repairDeletedBlockSummaryFromCheckpoint()` which takes a fresh checkpoint on 
the server. If deletedBlocks changes between the two RPCs, the applied repair 
may not correspond to the diff the operator confirmed. Consider combining 
compare+repair into a single RPC (eg include a checkpoint token / expected 
persisted summary / computed diff in the repair request) so the repair is 
applied to the exact snapshot that was presented.



##########
hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestDeletedBlockLog.java:
##########
@@ -957,6 +957,194 @@ private void mockStandAloneContainerInfo(long 
containerID, DatanodeDetails dd)
         .thenReturn(replicaSet);
   }
 
+  @Test
+  public void testResetToSummary() {
+    SCMDeletedBlockTransactionStatusManager statusManager =
+        deletedBlockLog.getSCMDeletedBlockTransactionStatusManager();
+
+    HddsProtos.DeletedBlocksTransactionSummary newSummary =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(200)
+            .setTotalBlockCount(1000)
+            .setTotalBlockSize(5_000_000L)
+            .setTotalBlockReplicatedSize(15_000_000L)
+            .build();
+
+    statusManager.resetToSummary(newSummary);
+
+    HddsProtos.DeletedBlocksTransactionSummary result = 
statusManager.getTransactionSummary();
+    assertEquals(200, result.getTotalTransactionCount());
+    assertEquals(1000, result.getTotalBlockCount());
+    assertEquals(5_000_000L, result.getTotalBlockSize());
+    assertEquals(15_000_000L, result.getTotalBlockReplicatedSize());
+  }
+
+  @Test
+  public void testResetToSummaryZeroValues() {
+    SCMDeletedBlockTransactionStatusManager statusManager =
+        deletedBlockLog.getSCMDeletedBlockTransactionStatusManager();
+
+    HddsProtos.DeletedBlocksTransactionSummary zero =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(0)
+            .setTotalBlockCount(0)
+            .setTotalBlockSize(0)
+            .setTotalBlockReplicatedSize(0)
+            .build();
+
+    statusManager.resetToSummary(zero);
+
+    HddsProtos.DeletedBlocksTransactionSummary result = 
statusManager.getTransactionSummary();
+    assertEquals(0, result.getTotalTransactionCount());
+    assertEquals(0, result.getTotalBlockCount());
+    assertEquals(0, result.getTotalBlockSize());
+    assertEquals(0, result.getTotalBlockReplicatedSize());
+  }
+
+  @Test
+  public void testCheckpointSummaryResultGetters() {
+    HddsProtos.DeletedBlocksTransactionSummary cpActual =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(10).setTotalBlockCount(50).build();
+    HddsProtos.DeletedBlocksTransactionSummary cpPersisted =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(8).setTotalBlockCount(40).build();
+    HddsProtos.DeletedBlocksTransactionSummary liveBefore =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(12).setTotalBlockCount(60).build();
+    HddsProtos.DeletedBlocksTransactionSummary liveAfter =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(14).setTotalBlockCount(70).build();
+
+    DeletedBlockLog.CheckpointSummaryResult result =
+        new DeletedBlockLog.CheckpointSummaryResult(cpActual, cpPersisted, 
liveBefore, liveAfter);
+
+    assertEquals(cpActual, result.getCheckpointActual());
+    assertEquals(cpPersisted, result.getCheckpointPersisted());
+    assertEquals(liveBefore, result.getLiveInMemBefore());
+    assertEquals(liveAfter, result.getLiveInMemAfter());
+  }
+
+  @Test
+  public void testCheckpointSummaryResultAllowsNullFields() {
+    DeletedBlockLog.CheckpointSummaryResult result =
+        new DeletedBlockLog.CheckpointSummaryResult(null, null, null, null);
+
+    assertEquals(null, result.getCheckpointActual());
+    assertEquals(null, result.getCheckpointPersisted());
+    assertEquals(null, result.getLiveInMemBefore());
+    assertEquals(null, result.getLiveInMemAfter());
+  }
+
+  @Test
+  public void testGetTransactionSummaryFromCheckpointReturnsActualCounts()
+      throws Exception {
+    int txCount = 5;
+    addTransactions(generateData(txCount), true);
+
+    DeletedBlockLog.CheckpointSummaryResult result =
+        deletedBlockLog.getTransactionSummaryFromCheckpoint(false);
+
+    assertEquals(txCount, 
result.getCheckpointActual().getTotalTransactionCount());
+    assertTrue(result.getCheckpointActual().getTotalBlockCount() > 0);
+    assertTrue(result.getCheckpointActual().getTotalBlockSize() > 0);
+  }
+
+  @Test
+  public void testGetTransactionSummaryFromCheckpointConsistentState()
+      throws Exception {
+    int txCount = 4;
+    addTransactions(generateData(txCount), true);
+
+    DeletedBlockLog.CheckpointSummaryResult result =
+        deletedBlockLog.getTransactionSummaryFromCheckpoint(false);
+
+    HddsProtos.DeletedBlocksTransactionSummary actual = 
result.getCheckpointActual();
+    HddsProtos.DeletedBlocksTransactionSummary persisted = 
result.getCheckpointPersisted();
+
+    // After a clean add+flush, the persisted summary should match actual.
+    assertEquals(actual.getTotalTransactionCount(), 
persisted.getTotalTransactionCount());
+    assertEquals(actual.getTotalBlockCount(), persisted.getTotalBlockCount());
+    assertEquals(actual.getTotalBlockSize(), persisted.getTotalBlockSize());
+  }
+
+  @Test
+  public void testGetTransactionSummaryFromCheckpointLiveStateMatches()
+      throws Exception {
+    int txCount = 3;
+    addTransactions(generateData(txCount), true);
+
+    SCMDeletedBlockTransactionStatusManager statusManager =
+        deletedBlockLog.getSCMDeletedBlockTransactionStatusManager();
+    HddsProtos.DeletedBlocksTransactionSummary inMem = 
statusManager.getTransactionSummary();
+
+    DeletedBlockLog.CheckpointSummaryResult result =
+        deletedBlockLog.getTransactionSummaryFromCheckpoint(false);
+
+    // liveInMemBefore should match the in-memory state at the time of the 
call.
+    assertEquals(inMem.getTotalTransactionCount(),
+        result.getLiveInMemBefore().getTotalTransactionCount());
+    assertEquals(inMem.getTotalBlockCount(),
+        result.getLiveInMemBefore().getTotalBlockCount());
+  }
+
+  @Test
+  public void testGetTransactionSummaryFromCheckpointRepairConsistentState()
+      throws Exception {
+    int txCount = 6;
+    addTransactions(generateData(txCount), true);
+    DeletedBlockLog.CheckpointSummaryResult result =
+        deletedBlockLog.getTransactionSummaryFromCheckpoint(true);
+
+    // When the checkpoint is consistent (actual == persisted),
+    // repair should not change the live in-memory state.
+    assertEquals(
+        result.getLiveInMemBefore().getTotalTransactionCount(),
+        result.getLiveInMemAfter().getTotalTransactionCount());
+    assertEquals(
+        result.getLiveInMemBefore().getTotalBlockCount(),
+        result.getLiveInMemAfter().getTotalBlockCount());
+    assertEquals(
+        result.getLiveInMemBefore().getTotalBlockSize(),
+        result.getLiveInMemAfter().getTotalBlockSize());
+    assertEquals(
+        result.getLiveInMemBefore().getTotalBlockReplicatedSize(),
+        result.getLiveInMemAfter().getTotalBlockReplicatedSize());
+  }
+
+  @Test
+  public void 
testGetTransactionSummaryFromCheckpointRepairAppliedWhenDiffExists()
+      throws Exception {
+    int txCount = 8;
+    addTransactions(generateData(txCount), true);
+
+    // Corrupt the in-memory state to simulate a counter drift.
+    SCMDeletedBlockTransactionStatusManager statusManager =
+        deletedBlockLog.getSCMDeletedBlockTransactionStatusManager();
+    HddsProtos.DeletedBlocksTransactionSummary drifted =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(999)
+            .setTotalBlockCount(9999)
+            .setTotalBlockSize(999_999L)
+            .setTotalBlockReplicatedSize(9_999_999L)
+            .build();
+    statusManager.resetToSummary(drifted);
+    assertEquals(999, 
statusManager.getTransactionSummary().getTotalTransactionCount());
+
+    // The repair should detect: cpActual(8) != cpPersisted(8) -> no diff from 
checkpoint
+    // perspective, so liveInMemBefore != liveInMemAfter only when cpActual != 
cpPersisted.
+    // In this test the checkpoint is internally consistent (actual == 
persisted),
+    // so hasDiff == false and no repair is applied from checkpoint alone.

Review Comment:
   The comment says `cpActual(8) != cpPersisted(8)` but the intended point is 
that they are equal (both 8) and therefore diff==0. Please correct this comment 
to avoid confusion about the checkpoint-diff logic.
   ```suggestion
       // The repair should detect that cpActual(8) == cpPersisted(8), so there 
is
       // no diff from the checkpoint perspective. In this test the checkpoint 
is
       // internally consistent (actual == persisted), so hasDiff == false and 
no
       // repair is applied from checkpoint alone.
   ```



##########
hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestDeletedBlockLog.java:
##########
@@ -957,6 +957,194 @@ private void mockStandAloneContainerInfo(long 
containerID, DatanodeDetails dd)
         .thenReturn(replicaSet);
   }
 
+  @Test
+  public void testResetToSummary() {
+    SCMDeletedBlockTransactionStatusManager statusManager =
+        deletedBlockLog.getSCMDeletedBlockTransactionStatusManager();
+
+    HddsProtos.DeletedBlocksTransactionSummary newSummary =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(200)
+            .setTotalBlockCount(1000)
+            .setTotalBlockSize(5_000_000L)
+            .setTotalBlockReplicatedSize(15_000_000L)
+            .build();
+
+    statusManager.resetToSummary(newSummary);
+
+    HddsProtos.DeletedBlocksTransactionSummary result = 
statusManager.getTransactionSummary();
+    assertEquals(200, result.getTotalTransactionCount());
+    assertEquals(1000, result.getTotalBlockCount());
+    assertEquals(5_000_000L, result.getTotalBlockSize());
+    assertEquals(15_000_000L, result.getTotalBlockReplicatedSize());
+  }
+
+  @Test
+  public void testResetToSummaryZeroValues() {
+    SCMDeletedBlockTransactionStatusManager statusManager =
+        deletedBlockLog.getSCMDeletedBlockTransactionStatusManager();
+
+    HddsProtos.DeletedBlocksTransactionSummary zero =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(0)
+            .setTotalBlockCount(0)
+            .setTotalBlockSize(0)
+            .setTotalBlockReplicatedSize(0)
+            .build();
+
+    statusManager.resetToSummary(zero);
+
+    HddsProtos.DeletedBlocksTransactionSummary result = 
statusManager.getTransactionSummary();
+    assertEquals(0, result.getTotalTransactionCount());
+    assertEquals(0, result.getTotalBlockCount());
+    assertEquals(0, result.getTotalBlockSize());
+    assertEquals(0, result.getTotalBlockReplicatedSize());
+  }
+
+  @Test
+  public void testCheckpointSummaryResultGetters() {
+    HddsProtos.DeletedBlocksTransactionSummary cpActual =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(10).setTotalBlockCount(50).build();
+    HddsProtos.DeletedBlocksTransactionSummary cpPersisted =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(8).setTotalBlockCount(40).build();
+    HddsProtos.DeletedBlocksTransactionSummary liveBefore =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(12).setTotalBlockCount(60).build();
+    HddsProtos.DeletedBlocksTransactionSummary liveAfter =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(14).setTotalBlockCount(70).build();
+
+    DeletedBlockLog.CheckpointSummaryResult result =
+        new DeletedBlockLog.CheckpointSummaryResult(cpActual, cpPersisted, 
liveBefore, liveAfter);
+
+    assertEquals(cpActual, result.getCheckpointActual());
+    assertEquals(cpPersisted, result.getCheckpointPersisted());
+    assertEquals(liveBefore, result.getLiveInMemBefore());
+    assertEquals(liveAfter, result.getLiveInMemAfter());
+  }
+
+  @Test
+  public void testCheckpointSummaryResultAllowsNullFields() {
+    DeletedBlockLog.CheckpointSummaryResult result =
+        new DeletedBlockLog.CheckpointSummaryResult(null, null, null, null);
+
+    assertEquals(null, result.getCheckpointActual());
+    assertEquals(null, result.getCheckpointPersisted());
+    assertEquals(null, result.getLiveInMemBefore());
+    assertEquals(null, result.getLiveInMemAfter());
+  }
+
+  @Test
+  public void testGetTransactionSummaryFromCheckpointReturnsActualCounts()
+      throws Exception {
+    int txCount = 5;
+    addTransactions(generateData(txCount), true);
+
+    DeletedBlockLog.CheckpointSummaryResult result =
+        deletedBlockLog.getTransactionSummaryFromCheckpoint(false);
+
+    assertEquals(txCount, 
result.getCheckpointActual().getTotalTransactionCount());
+    assertTrue(result.getCheckpointActual().getTotalBlockCount() > 0);
+    assertTrue(result.getCheckpointActual().getTotalBlockSize() > 0);
+  }
+
+  @Test
+  public void testGetTransactionSummaryFromCheckpointConsistentState()
+      throws Exception {
+    int txCount = 4;
+    addTransactions(generateData(txCount), true);
+
+    DeletedBlockLog.CheckpointSummaryResult result =
+        deletedBlockLog.getTransactionSummaryFromCheckpoint(false);
+
+    HddsProtos.DeletedBlocksTransactionSummary actual = 
result.getCheckpointActual();
+    HddsProtos.DeletedBlocksTransactionSummary persisted = 
result.getCheckpointPersisted();
+
+    // After a clean add+flush, the persisted summary should match actual.
+    assertEquals(actual.getTotalTransactionCount(), 
persisted.getTotalTransactionCount());
+    assertEquals(actual.getTotalBlockCount(), persisted.getTotalBlockCount());
+    assertEquals(actual.getTotalBlockSize(), persisted.getTotalBlockSize());
+  }
+
+  @Test
+  public void testGetTransactionSummaryFromCheckpointLiveStateMatches()
+      throws Exception {
+    int txCount = 3;
+    addTransactions(generateData(txCount), true);
+
+    SCMDeletedBlockTransactionStatusManager statusManager =
+        deletedBlockLog.getSCMDeletedBlockTransactionStatusManager();
+    HddsProtos.DeletedBlocksTransactionSummary inMem = 
statusManager.getTransactionSummary();
+
+    DeletedBlockLog.CheckpointSummaryResult result =
+        deletedBlockLog.getTransactionSummaryFromCheckpoint(false);
+
+    // liveInMemBefore should match the in-memory state at the time of the 
call.
+    assertEquals(inMem.getTotalTransactionCount(),
+        result.getLiveInMemBefore().getTotalTransactionCount());
+    assertEquals(inMem.getTotalBlockCount(),
+        result.getLiveInMemBefore().getTotalBlockCount());
+  }
+
+  @Test
+  public void testGetTransactionSummaryFromCheckpointRepairConsistentState()
+      throws Exception {
+    int txCount = 6;
+    addTransactions(generateData(txCount), true);
+    DeletedBlockLog.CheckpointSummaryResult result =
+        deletedBlockLog.getTransactionSummaryFromCheckpoint(true);
+
+    // When the checkpoint is consistent (actual == persisted),
+    // repair should not change the live in-memory state.
+    assertEquals(
+        result.getLiveInMemBefore().getTotalTransactionCount(),
+        result.getLiveInMemAfter().getTotalTransactionCount());
+    assertEquals(
+        result.getLiveInMemBefore().getTotalBlockCount(),
+        result.getLiveInMemAfter().getTotalBlockCount());
+    assertEquals(
+        result.getLiveInMemBefore().getTotalBlockSize(),
+        result.getLiveInMemAfter().getTotalBlockSize());
+    assertEquals(
+        result.getLiveInMemBefore().getTotalBlockReplicatedSize(),
+        result.getLiveInMemAfter().getTotalBlockReplicatedSize());
+  }
+
+  @Test
+  public void 
testGetTransactionSummaryFromCheckpointRepairAppliedWhenDiffExists()
+      throws Exception {
+    int txCount = 8;
+    addTransactions(generateData(txCount), true);
+
+    // Corrupt the in-memory state to simulate a counter drift.
+    SCMDeletedBlockTransactionStatusManager statusManager =
+        deletedBlockLog.getSCMDeletedBlockTransactionStatusManager();
+    HddsProtos.DeletedBlocksTransactionSummary drifted =
+        HddsProtos.DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(999)
+            .setTotalBlockCount(9999)
+            .setTotalBlockSize(999_999L)
+            .setTotalBlockReplicatedSize(9_999_999L)
+            .build();
+    statusManager.resetToSummary(drifted);
+    assertEquals(999, 
statusManager.getTransactionSummary().getTotalTransactionCount());
+
+    // The repair should detect: cpActual(8) != cpPersisted(8) -> no diff from 
checkpoint
+    // perspective, so liveInMemBefore != liveInMemAfter only when cpActual != 
cpPersisted.
+    // In this test the checkpoint is internally consistent (actual == 
persisted),
+    // so hasDiff == false and no repair is applied from checkpoint alone.
+    // This validates that resetToSummary and hasDiff work together correctly.
+    DeletedBlockLog.CheckpointSummaryResult result =
+        deletedBlockLog.getTransactionSummaryFromCheckpoint(true);
+
+    // liveInMemBefore should reflect our drifted value.
+    assertEquals(999, result.getLiveInMemBefore().getTotalTransactionCount());
+    // Since cpActual == cpPersisted (both 8), diff == 0, no repair applied.
+    assertEquals(999, result.getLiveInMemAfter().getTotalTransactionCount());
+  }

Review Comment:
   Test name and assertions don't match: 
`testGetTransactionSummaryFromCheckpointRepairAppliedWhenDiffExists` sets an 
in-memory drift but then asserts *no* repair is applied because `cpActual == 
cpPersisted`. Rename the test (or change setup) so the name reflects the 
behavior being validated.



##########
hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/ozone/admin/scm/GetDeletedBlockSummarySubcommand.java:
##########
@@ -18,37 +18,196 @@
 package org.apache.hadoop.ozone.admin.scm;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Scanner;
 import org.apache.hadoop.hdds.cli.HddsVersionProvider;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionSummary;
+import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos
+    .RepairDeletedBlocksTxnSummaryFromCheckpointResponseProto;
 import org.apache.hadoop.hdds.scm.cli.ScmSubcommand;
 import org.apache.hadoop.hdds.scm.client.ScmClient;
 import picocli.CommandLine;
 
 /**
  * Handler of getting deleted blocks summary from SCM side.
+ *
+ * <ul>
+ *   <li><b>(no flag)</b> — print in-memory summary.</li>
+ *   <li><b>--detail</b> — take a RocksDB checkpoint and show Checkpoint-Actual
+ *       vs Checkpoint-Persisted for each dimension.</li>
+ *   <li><b>--repair</b> — same as --detail, then compute the diff, ask the
+ *       operator to confirm, and apply it to the live in-memory counters.</li>
+ * </ul>
  */
 @CommandLine.Command(
     name = "summary",
-    description = "get DeletedBlocksTransaction summary",
+    description = "Get DeletedBlocksTransaction summary",
     mixinStandardHelpOptions = true,
     versionProvider = HddsVersionProvider.class)
 public class GetDeletedBlockSummarySubcommand extends ScmSubcommand {
 
+  @CommandLine.Option(names = {"--detail"},
+      description = "Take a RocksDB checkpoint and compare the actual row "
+          + "counts in the checkpoint with the persisted summary stored in the 
"
+          + "same checkpoint.")
+  private boolean detail = false;
+
+  @CommandLine.Option(names = {"--repair"},
+      description = "Same as --detail: shows the diff between 
checkpoint-actual "
+          + "and checkpoint-persisted, then prompts the operator before 
applying "
+          + "the diff to the live in-memory counters.")
+  private boolean repair = false;
+
   @Override
   public void execute(ScmClient client) throws IOException {
-    HddsProtos.DeletedBlocksTransactionSummary summary = 
client.getDeletedBlockSummary();
-    if (summary == null) {
-      System.out.println("DeletedBlocksTransaction summary is not available");
+    if (repair) {
+      executeRepair(client);
+    } else if (detail) {
+      executeDetail(client);
     } else {
-      System.out.println("DeletedBlocksTransaction summary:");
-      System.out.println("  Total number of transactions: " +
-          summary.getTotalTransactionCount());
-      System.out.println("  Total number of blocks: " +
-          summary.getTotalBlockCount());
-      System.out.println("  Total size of blocks: " +
-          summary.getTotalBlockSize());
-      System.out.println("  Total replicated size of blocks: " +
-          summary.getTotalBlockReplicatedSize());
+      executeDefault(client);
+    }

Review Comment:
   `--detail` and `--repair` can both be specified at the same time; the 
current implementation silently prioritizes `--repair`. Consider making these 
options mutually exclusive (eg via picocli ArgGroup/exclusive or an explicit 
validation in `execute`) and emitting a clear error message when both are set.



##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java:
##########
@@ -1004,6 +1006,69 @@ public DeletedBlocksTransactionSummary 
getDeletedBlockSummary() {
     }
   }
 
+  @Override
+  public RepairDeletedBlocksTxnSummaryFromCheckpointResponseProto
+      getDeletedBlockSummaryFromCheckpoint() throws IOException {
+    UserGroupInformation remoteUser = getRemoteUser();
+    final Map<String, String> auditMap = Maps.newHashMap();
+    auditMap.put("remoteUser", remoteUser.getUserName());
+    try {
+      getScm().checkAdminAccess(remoteUser, false);

Review Comment:
   `getDeletedBlockSummaryFromCheckpoint()` is a read operation but it calls 
`checkAdminAccess(remoteUser, false)`, which blocks read-only SCM admins. If 
the intent is to allow read-only admins to run `--detail`, pass `isRead=true` 
here (keep `false` for the repair RPC).
   ```suggestion
         getScm().checkAdminAccess(remoteUser, true);
   ```



##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java:
##########
@@ -484,6 +490,141 @@ public DeletedBlocksTransactionSummary 
getTransactionSummary() {
     return transactionStatusManager.getTransactionSummary();
   }
 
+  @Override
+  public CheckpointSummaryResult getTransactionSummaryFromCheckpoint(boolean 
repair)
+      throws IOException {
+    //Take a RocksDB checkpoint (hard-linked snapshot; fast)
+    DBStore store = scm.getScmMetadataStore().getStore();
+    DBCheckpoint checkpoint = store.getCheckpoint(false);
+    try {
+      //Open the checkpoint read-only
+      Path checkpointLocation = checkpoint.getCheckpointLocation();
+      Path checkpointFileName = checkpointLocation.getFileName();
+      if (checkpointFileName == null) {
+        throw new IOException(
+            "Cannot determine checkpoint directory name from path: " + 
checkpointLocation);
+      }
+      DBStore cpStore = DBStoreBuilder
+          .newBuilder(scm.getConfiguration(),
+              org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition.get(),
+              checkpointFileName.toString(),
+              checkpointLocation.getParent())
+          .setOpenReadOnly(true)
+          .build();
+      try {
+        //Scan deletedBlocks table in checkpoint
+        // NOTE: deletingTxIDs is NOT applied here because this is a raw
+        // snapshot; those rows are physically present in the checkpoint.
+        long cpTxCount = 0, cpBlockCount = 0;
+        long cpBlockSize = 0, cpReplicatedSize = 0;
+
+        Table<Long, DeletedBlocksTransaction> cpDeletedTable =
+            
org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition.DELETED_BLOCKS.getTable(cpStore);
+        try (Table.KeyValueIterator<Long, DeletedBlocksTransaction> iter =
+                 cpDeletedTable.iterator()) {
+          while (iter.hasNext()) {
+            DeletedBlocksTransaction tx = iter.next().getValue();
+            if (tx != null && tx.hasTotalBlockSize()) {
+              cpTxCount++;
+              cpBlockCount += tx.getLocalIDCount();
+              cpBlockSize  += tx.getTotalBlockSize();
+              if (tx.hasTotalBlockReplicatedSize()) {
+                cpReplicatedSize += tx.getTotalBlockReplicatedSize();
+              }
+            }

Review Comment:
   Checkpoint scan currently counts a transaction only when 
`tx.hasTotalBlockSize()`. `totalBlockSize` is optional in 
`DeletedBlocksTransaction` (eg when STORAGE_SPACE_DISTRIBUTION isn't finalized 
/ older OM writes), so this can undercount `cpTxCount`/`cpBlockCount` and 
produce incorrect comparisons/repairs. Count txns/blocks regardless, and treat 
missing size/replicatedSize as 0 when summing.



##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java:
##########
@@ -484,6 +490,141 @@ public DeletedBlocksTransactionSummary 
getTransactionSummary() {
     return transactionStatusManager.getTransactionSummary();
   }
 
+  @Override
+  public CheckpointSummaryResult getTransactionSummaryFromCheckpoint(boolean 
repair)
+      throws IOException {
+    //Take a RocksDB checkpoint (hard-linked snapshot; fast)
+    DBStore store = scm.getScmMetadataStore().getStore();
+    DBCheckpoint checkpoint = store.getCheckpoint(false);
+    try {
+      //Open the checkpoint read-only
+      Path checkpointLocation = checkpoint.getCheckpointLocation();
+      Path checkpointFileName = checkpointLocation.getFileName();
+      if (checkpointFileName == null) {
+        throw new IOException(
+            "Cannot determine checkpoint directory name from path: " + 
checkpointLocation);
+      }
+      DBStore cpStore = DBStoreBuilder
+          .newBuilder(scm.getConfiguration(),
+              org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition.get(),
+              checkpointFileName.toString(),
+              checkpointLocation.getParent())
+          .setOpenReadOnly(true)
+          .build();
+      try {
+        //Scan deletedBlocks table in checkpoint
+        // NOTE: deletingTxIDs is NOT applied here because this is a raw
+        // snapshot; those rows are physically present in the checkpoint.
+        long cpTxCount = 0, cpBlockCount = 0;
+        long cpBlockSize = 0, cpReplicatedSize = 0;
+
+        Table<Long, DeletedBlocksTransaction> cpDeletedTable =
+            
org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition.DELETED_BLOCKS.getTable(cpStore);
+        try (Table.KeyValueIterator<Long, DeletedBlocksTransaction> iter =
+                 cpDeletedTable.iterator()) {
+          while (iter.hasNext()) {
+            DeletedBlocksTransaction tx = iter.next().getValue();
+            if (tx != null && tx.hasTotalBlockSize()) {
+              cpTxCount++;
+              cpBlockCount += tx.getLocalIDCount();
+              cpBlockSize  += tx.getTotalBlockSize();
+              if (tx.hasTotalBlockReplicatedSize()) {
+                cpReplicatedSize += tx.getTotalBlockReplicatedSize();
+              }
+            }
+          }
+        }
+
+        //Read persisted summary from checkpoint
+        DeletedBlocksTransactionSummary cpPersisted = null;
+        Table<String, com.google.protobuf.ByteString> cpConfigTable =
+            
org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition.STATEFUL_SERVICE_CONFIG.getTable(cpStore);
+        com.google.protobuf.ByteString bytes =
+            cpConfigTable.get(DeletedBlockLogStateManagerImpl.SERVICE_NAME);
+        if (bytes != null) {
+          cpPersisted = DeletedBlocksTransactionSummary.parseFrom(bytes);
+        }
+        if (cpPersisted == null) {
+          cpPersisted = DeletedBlocksTransactionSummary.newBuilder().build();
+        }
+
+        DeletedBlocksTransactionSummary cpActual =
+            DeletedBlocksTransactionSummary.newBuilder()
+                .setTotalTransactionCount(cpTxCount)
+                .setTotalBlockCount(cpBlockCount)
+                .setTotalBlockSize(cpBlockSize)
+                .setTotalBlockReplicatedSize(cpReplicatedSize)
+                .build();
+
+        //Capture live in-memory state before any repair
+        DeletedBlocksTransactionSummary liveInMemBefore;
+        DeletedBlocksTransactionSummary liveInMemAfter;
+        lock.lock();
+        try {
+          liveInMemBefore = transactionStatusManager.getTransactionSummary();
+
+          if (repair) {
+            // Diff = cpActual - cpPersisted; apply to live-in-memory.
+            long diffTx   = cpTxCount       - 
cpPersisted.getTotalTransactionCount();
+            long diffBlk  = cpBlockCount    - cpPersisted.getTotalBlockCount();
+            long diffSize = cpBlockSize     - cpPersisted.getTotalBlockSize();
+            long diffRepl = cpReplicatedSize - 
cpPersisted.getTotalBlockReplicatedSize();
+
+            boolean hasDiff = diffTx != 0 || diffBlk != 0 || diffSize != 0 || 
diffRepl != 0;
+            if (hasDiff) {
+              long newTx   = liveInMemBefore.getTotalTransactionCount()    + 
diffTx;
+              long newBlk  = liveInMemBefore.getTotalBlockCount()          + 
diffBlk;
+              long newSize = liveInMemBefore.getTotalBlockSize()           + 
diffSize;
+              long newRepl = liveInMemBefore.getTotalBlockReplicatedSize() + 
diffRepl;
+              // Clamp to zero to avoid negative counters from stale 
checkpoints.
+              newTx   = Math.max(0, newTx);
+              newBlk  = Math.max(0, newBlk);
+              newSize = Math.max(0, newSize);
+              newRepl = Math.max(0, newRepl);
+
+              LOG.warn("Checkpoint repair: cpActual txns={} blk={} size={} 
repl={} "
+                  + "vs cpPersisted txns={} blk={} size={} repl={} "
+                  + "— applying diff to live inMem (before: txns={} blk={} 
size={} repl={}).",
+                  cpTxCount, cpBlockCount, cpBlockSize, cpReplicatedSize,
+                  cpPersisted.getTotalTransactionCount(), 
cpPersisted.getTotalBlockCount(),
+                  cpPersisted.getTotalBlockSize(), 
cpPersisted.getTotalBlockReplicatedSize(),
+                  liveInMemBefore.getTotalTransactionCount(), 
liveInMemBefore.getTotalBlockCount(),
+                  liveInMemBefore.getTotalBlockSize(), 
liveInMemBefore.getTotalBlockReplicatedSize());
+
+              doRepair(newTx, newBlk, newSize, newRepl);
+            } else {
+              LOG.info("Checkpoint repair: checkpoint is consistent "
+                  + "(cpActual == cpPersisted), no changes applied to live 
state.");
+            }
+          }
+          liveInMemAfter = transactionStatusManager.getTransactionSummary();
+        } finally {
+          lock.unlock();
+        }
+
+        return new CheckpointSummaryResult(cpActual, cpPersisted,
+            liveInMemBefore, liveInMemAfter);
+      } finally {
+        cpStore.close();
+      }
+    } finally {
+      checkpoint.cleanupCheckpoint();
+    }
+  }
+
+  private void doRepair(long newTx, long newBlk, long newSize, long newRepl)
+      throws IOException {
+    DeletedBlocksTransactionSummary newSummary =
+        DeletedBlocksTransactionSummary.newBuilder()
+            .setTotalTransactionCount(newTx)
+            .setTotalBlockCount(newBlk)
+            .setTotalBlockSize(newSize)
+            .setTotalBlockReplicatedSize(newRepl)
+            .build();
+    transactionStatusManager.resetToSummary(newSummary);
+    deletedBlockLogStateManager.updateSummaryInDb(newSummary);

Review Comment:
   `doRepair()` resets the in-memory counters before persisting the corrected 
summary via `updateSummaryInDb()`. If the DB update throws (eg RocksDB/codec 
error), the method will exit with in-memory state changed but the persisted 
summary unchanged, potentially worsening drift. Persist first (or rollback on 
failure) so in-memory and DB remain consistent under errors.
   ```suggestion
       deletedBlockLogStateManager.updateSummaryInDb(newSummary);
       transactionStatusManager.resetToSummary(newSummary);
   ```



##########
hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/ozone/admin/scm/GetDeletedBlockSummarySubcommand.java:
##########
@@ -18,37 +18,196 @@
 package org.apache.hadoop.ozone.admin.scm;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Scanner;
 import org.apache.hadoop.hdds.cli.HddsVersionProvider;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionSummary;
+import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos
+    .RepairDeletedBlocksTxnSummaryFromCheckpointResponseProto;
 import org.apache.hadoop.hdds.scm.cli.ScmSubcommand;
 import org.apache.hadoop.hdds.scm.client.ScmClient;
 import picocli.CommandLine;
 
 /**
  * Handler of getting deleted blocks summary from SCM side.
+ *
+ * <ul>
+ *   <li><b>(no flag)</b> — print in-memory summary.</li>
+ *   <li><b>--detail</b> — take a RocksDB checkpoint and show Checkpoint-Actual
+ *       vs Checkpoint-Persisted for each dimension.</li>
+ *   <li><b>--repair</b> — same as --detail, then compute the diff, ask the
+ *       operator to confirm, and apply it to the live in-memory counters.</li>
+ * </ul>
  */
 @CommandLine.Command(
     name = "summary",
-    description = "get DeletedBlocksTransaction summary",
+    description = "Get DeletedBlocksTransaction summary",
     mixinStandardHelpOptions = true,
     versionProvider = HddsVersionProvider.class)
 public class GetDeletedBlockSummarySubcommand extends ScmSubcommand {
 
+  @CommandLine.Option(names = {"--detail"},
+      description = "Take a RocksDB checkpoint and compare the actual row "
+          + "counts in the checkpoint with the persisted summary stored in the 
"
+          + "same checkpoint.")
+  private boolean detail = false;
+
+  @CommandLine.Option(names = {"--repair"},
+      description = "Same as --detail: shows the diff between 
checkpoint-actual "
+          + "and checkpoint-persisted, then prompts the operator before 
applying "
+          + "the diff to the live in-memory counters.")
+  private boolean repair = false;
+
   @Override
   public void execute(ScmClient client) throws IOException {
-    HddsProtos.DeletedBlocksTransactionSummary summary = 
client.getDeletedBlockSummary();
-    if (summary == null) {
-      System.out.println("DeletedBlocksTransaction summary is not available");
+    if (repair) {
+      executeRepair(client);
+    } else if (detail) {
+      executeDetail(client);
     } else {
-      System.out.println("DeletedBlocksTransaction summary:");
-      System.out.println("  Total number of transactions: " +
-          summary.getTotalTransactionCount());
-      System.out.println("  Total number of blocks: " +
-          summary.getTotalBlockCount());
-      System.out.println("  Total size of blocks: " +
-          summary.getTotalBlockSize());
-      System.out.println("  Total replicated size of blocks: " +
-          summary.getTotalBlockReplicatedSize());
+      executeDefault(client);
+    }
+  }
+
+  private void executeDefault(ScmClient client) throws IOException {
+    DeletedBlocksTransactionSummary summary = client.getDeletedBlockSummary();
+    if (summary == null) {
+      System.out.println("DeletedBlocksTransaction summary is not available.");
+      return;
+    }
+    System.out.println("DeletedBlocksTransaction summary:");
+    System.out.println("Total number of transactions: " + 
summary.getTotalTransactionCount());
+    System.out.println("Total number of blocks: " + 
summary.getTotalBlockCount());
+    System.out.println("Total size of blocks: " + summary.getTotalBlockSize());
+    System.out.println("Total replicated size of blocks: " + 
summary.getTotalBlockReplicatedSize());
+  }
+
+  private void executeDetail(ScmClient client) throws IOException {
+    System.out.println("Taking RocksDB checkpoint and scanning deletedBlocks 
table ...");
+    RepairDeletedBlocksTxnSummaryFromCheckpointResponseProto resp =
+        client.getDeletedBlockSummaryFromCheckpoint();
+    printComparisonTable(resp);
+  }
+
+  private void executeRepair(ScmClient client) throws IOException {
+    System.out.println("Taking RocksDB checkpoint and scanning deletedBlocks 
table ...");
+    RepairDeletedBlocksTxnSummaryFromCheckpointResponseProto resp =
+        client.getDeletedBlockSummaryFromCheckpoint();
+    printComparisonTable(resp);
+
+    DeletedBlocksTransactionSummary actual =
+        resp.hasCheckpointActual() ? resp.getCheckpointActual() : null;
+    DeletedBlocksTransactionSummary persisted =
+        resp.hasCheckpointPersisted() ? resp.getCheckpointPersisted() : null;
+
+    if (!hasDiff(actual, persisted)) {
+      System.out.println("\nCheckpoint is consistent and nothing to repair.");
+      return;
+    }
+
+    System.out.println("\nDiff (Checkpoint-Actual minus 
Checkpoint-Persisted):");
+    printDiff(actual, persisted);
+
+    System.out.print("\nApply this diff to live in-memory counters? [y/N]: ");
+    System.out.flush();
+    String answer = new Scanner(System.in, 
StandardCharsets.UTF_8.name()).nextLine().trim();
+    if (!answer.equalsIgnoreCase("y") && !answer.equalsIgnoreCase("yes")) {
+      System.out.println("Aborted. No changes applied.");
+      return;

Review Comment:
   The confirmation prompt reads from `System.in` via `new 
Scanner(...).nextLine()` without handling EOF / closed stdin. In 
non-interactive environments this can throw and terminate the CLI; consider 
guarding for `NoSuchElementException`/`IllegalStateException` and treating it 
as a "no" response (or providing a `--yes`/`--no-prompt` flag).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to