errose28 commented on code in PR #7474:
URL: https://github.com/apache/ozone/pull/7474#discussion_r2037969853


##########
hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/checksum/TestContainerCommandReconciliation.java:
##########
@@ -247,26 +329,229 @@ public void testGetChecksumInfoSuccess() throws 
Exception {
     }
   }
 
-  private long writeDataAndGetContainer(boolean close) throws Exception {
-    String volumeName = UUID.randomUUID().toString();
-    String bucketName = UUID.randomUUID().toString();
+  @Test
+  public void testContainerChecksumWithBlockMissing() throws Exception {
+    // 1. Write data to a container.
+    // Read the key back and check its hash.
+    String volume = UUID.randomUUID().toString();
+    String bucket = UUID.randomUUID().toString();
+    Pair<Long, byte[]> containerAndData = getDataAndContainer(true, 20 * 1024 
* 1024, volume, bucket);
+    long containerID = containerAndData.getLeft();
+    byte[] data = containerAndData.getRight();
+    // Get the datanodes where the container replicas are stored.
+    List<DatanodeDetails> dataNodeDetails = 
cluster.getStorageContainerManager().getContainerManager()
+        .getContainerReplicas(ContainerID.valueOf(containerID))
+        .stream().map(ContainerReplica::getDatanodeDetails)
+        .collect(Collectors.toList());
+    Assertions.assertEquals(3, dataNodeDetails.size());
+    HddsDatanodeService hddsDatanodeService = 
cluster.getHddsDatanode(dataNodeDetails.get(0));
+    DatanodeStateMachine datanodeStateMachine = 
hddsDatanodeService.getDatanodeStateMachine();
+    Container<?> container = 
datanodeStateMachine.getContainer().getContainerSet().getContainer(containerID);
+    KeyValueContainerData containerData = (KeyValueContainerData) 
container.getContainerData();
+    ContainerProtos.ContainerChecksumInfo oldContainerChecksumInfo = 
readChecksumFile(container.getContainerData());
+    KeyValueHandler kvHandler = (KeyValueHandler) 
datanodeStateMachine.getContainer().getDispatcher()
+        .getHandler(ContainerProtos.ContainerType.KeyValueContainer);
+
+    BlockManager blockManager = kvHandler.getBlockManager();
+    List<BlockData> blockDataList = blockManager.listBlock(container, -1, 100);
+    String chunksPath = container.getContainerData().getChunksPath();
+    long oldDataChecksum = 
oldContainerChecksumInfo.getContainerMerkleTree().getDataChecksum();
+
+    // 2. Delete some blocks to simulate missing blocks.
+    try (DBHandle db = BlockUtils.getDB(containerData, conf);
+         BatchOperation op = 
db.getStore().getBatchHandler().initBatchOperation()) {
+      for (int i = 0; i < blockDataList.size(); i += 2) {
+        BlockData blockData = blockDataList.get(i);
+        // Delete the block metadata from the container db
+        db.getStore().getBlockDataTable().deleteWithBatch(op, 
containerData.getBlockKey(blockData.getLocalID()));
+        // Delete the block file.
+        Files.deleteIfExists(Paths.get(chunksPath + "/" + 
blockData.getBlockID().getLocalID() + ".block"));
+      }
+      db.getStore().getBatchHandler().commitBatchOperation(op);
+      db.getStore().flushDB();
+    }
+
+    // TODO: Use On-demand container scanner to build the new container merkle 
tree. (HDDS-10374)
+    
Files.deleteIfExists(getContainerChecksumFile(container.getContainerData()).toPath());
+    kvHandler.createContainerMerkleTree(container);
+    ContainerProtos.ContainerChecksumInfo containerChecksumAfterBlockDelete =
+        readChecksumFile(container.getContainerData());
+    long dataChecksumAfterBlockDelete = 
containerChecksumAfterBlockDelete.getContainerMerkleTree().getDataChecksum();
+    // Checksum should have changed after block delete.
+    Assertions.assertNotEquals(oldDataChecksum, dataChecksumAfterBlockDelete);
+
+    // 3. Reconcile the container.
+    StorageContainerLocationProtocolClientSideTranslatorPB scmClient = 
cluster.getStorageContainerLocationClient();
+    long lastHeartbeat = 
cluster.getStorageContainerManager().getScmNodeManager()
+        .getLastHeartbeat(dataNodeDetails.get(0));
+    scmClient.reconcileContainer(containerID);
+    GenericTestUtils.waitFor(() -> {
+      try {
+        ContainerProtos.ContainerChecksumInfo newContainerChecksumInfo = 
readChecksumFile(container.getContainerData());
+        long newHeartbeat = 
cluster.getStorageContainerManager().getScmNodeManager()
+            .getLastHeartbeat(dataNodeDetails.get(0));
+        return 
newContainerChecksumInfo.getContainerMerkleTree().getDataChecksum() == 
oldDataChecksum &&
+            newHeartbeat > lastHeartbeat;

Review Comment:
   This still isn't correct, because not every heartbeat will have container 
reports. The later checks may fail if they run before DN heartbeats the 
repaired container information. They may also pass incorrectly if SCM never saw 
the corrupted container information.
   
   To ensure the test is not flaky:
   1. Corrupt the container and get its resulting DN hash
   2. Wait for SCM to see that corrupted hash
   3. Reconcile
   4. Wait for SCM to see the non-corrupted hash



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to