This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 8b93ea4a5a8 HDDS-14977. Intermittent failure in
TestDeadNodeHandler.testOnMessage (#10556)
8b93ea4a5a8 is described below
commit 8b93ea4a5a8d959d0375480e240faee096ebbabc
Author: Chi-Hsuan Huang <[email protected]>
AuthorDate: Sun Jun 21 14:34:35 2026 +0800
HDDS-14977. Intermittent failure in TestDeadNodeHandler.testOnMessage
(#10556)
---
.../apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java
index b52733cdc10..7b28de473ea 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDeadNodeHandler.java
@@ -73,7 +73,6 @@
import org.apache.hadoop.ozone.protocol.commands.DeleteBlocksCommand;
import
org.apache.hadoop.security.authentication.client.AuthenticationException;
import org.apache.ozone.test.LambdaTestUtils;
-import org.apache.ozone.test.tag.Flaky;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -100,6 +99,11 @@ public void setup() throws IOException,
AuthenticationException {
OzoneConfiguration conf = new OzoneConfiguration();
conf.setTimeDuration(HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT,
0, TimeUnit.SECONDS);
+ // The test drives node health transitions manually. Effectively disable the
+ // periodic health check (1-hour interval) so it cannot resurrect a node forced
+ // to DEAD (its heartbeat stays fresh) and race with the handlers under test.
+ conf.setTimeDuration(ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL,
+ 1, TimeUnit.HOURS);
conf.setInt(ScmConfigKeys.OZONE_DATANODE_PIPELINE_LIMIT, 2);
conf.setStorageSize(OZONE_DATANODE_RATIS_VOLUME_FREE_SPACE_MIN,
10, StorageUnit.MB);
@@ -139,7 +143,6 @@ public void teardown() {
@Test
@SuppressWarnings("checkstyle:MethodLength")
- @Flaky("HDDS-14977")
public void testOnMessage(@TempDir File tempDir) throws Exception {
//GIVEN
DatanodeDetails datanode1 = MockDatanodeDetails.randomDatanodeDetails();
@@ -266,6 +269,12 @@ public void testOnMessage(@TempDir File tempDir) throws
Exception {
nodeManager.addDatanodeCommand(datanode1.getID(), cmd);
nodeManager.setNodeOperationalState(datanode1,
HddsProtos.NodeOperationalState.IN_SERVICE);
+ // Changing the operational state of a DEAD node fires a DEAD_NODE event on
+ // SCM's event queue. Let SCM's own DeadNodeHandler process it here, so its
+ // asynchronous topology removal does not race with the handlers driven
+ // below (it could otherwise remove the node right after
+ // HealthyReadOnlyNodeHandler re-adds it).
+ ((EventQueue) scm.getEventQueue()).processAll(60000L);
setNodeHealthState(datanode1, HddsProtos.NodeState.DEAD);
deadNodeHandler.onMessage(datanode1, publisher);
//datanode1 has been removed from ClusterNetworkTopology, another
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]