This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 8b93ea4a5a8 HDDS-14977. Intermittent failure in TestDeadNodeHandler.testOnMessage (#10556)
8b93ea4a5a8 is described below

commit 8b93ea4a5a8d959d0375480e240faee096ebbabc
Author: Chi-Hsuan Huang <[email protected]>
AuthorDate: Sun Jun 21 14:34:35 2026 +0800

    HDDS-14977. Intermittent failure in TestDeadNodeHandler.testOnMessage (#10556)
---
 .../apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java    | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java
index b52733cdc10..7b28de473ea 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java
@@ -73,7 +73,6 @@
 import org.apache.hadoop.ozone.protocol.commands.DeleteBlocksCommand;
 import org.apache.hadoop.security.authentication.client.AuthenticationException;
 import org.apache.ozone.test.LambdaTestUtils;
-import org.apache.ozone.test.tag.Flaky;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -100,6 +99,11 @@ public void setup() throws IOException, AuthenticationException {
     OzoneConfiguration conf = new OzoneConfiguration();
     conf.setTimeDuration(HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT,
         0, TimeUnit.SECONDS);
+    // The test drives node health transitions manually. Disable the periodic
+    // health check so it does not resurrect a node forced to DEAD (the node's
+    // heartbeat stays fresh), which would race with the handlers under test.
+    conf.setTimeDuration(ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL,
+        1, TimeUnit.HOURS);
     conf.setInt(ScmConfigKeys.OZONE_DATANODE_PIPELINE_LIMIT, 2);
     conf.setStorageSize(OZONE_DATANODE_RATIS_VOLUME_FREE_SPACE_MIN,
         10, StorageUnit.MB);
@@ -139,7 +143,6 @@ public void teardown() {
 
   @Test
   @SuppressWarnings("checkstyle:MethodLength")
-  @Flaky("HDDS-14977")
   public void testOnMessage(@TempDir File tempDir) throws Exception {
     //GIVEN
     DatanodeDetails datanode1 = MockDatanodeDetails.randomDatanodeDetails();
@@ -266,6 +269,12 @@ public void testOnMessage(@TempDir File tempDir) throws Exception {
     nodeManager.addDatanodeCommand(datanode1.getID(), cmd);
     nodeManager.setNodeOperationalState(datanode1,
         HddsProtos.NodeOperationalState.IN_SERVICE);
+    // Changing the operational state of a DEAD node fires a DEAD_NODE event on
+    // SCM's event queue. Let SCM's own DeadNodeHandler process it here, so its
+    // asynchronous topology removal does not race with the handlers driven
+    // below (it could otherwise remove the node right after
+    // HealthyReadOnlyNodeHandler re-adds it).
+    ((EventQueue) scm.getEventQueue()).processAll(60000L);
     setNodeHealthState(datanode1, HddsProtos.NodeState.DEAD);
     deadNodeHandler.onMessage(datanode1, publisher);
     //datanode1 has been removed from ClusterNetworkTopology, another


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to