This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 85a1e4d3ad HDDS-6634. Datanode should start from GetVersion state when re-registering to SCM (#5266)
85a1e4d3ad is described below

commit 85a1e4d3ad6f2e0088bf6124965fd131ba803a5b
Author: Sadanand Shenoy <[email protected]>
AuthorDate: Sat Sep 23 13:02:02 2023 +0530

    HDDS-6634. Datanode should start from GetVersion state when re-registering to SCM (#5266)
---
 .../states/endpoint/HeartbeatEndpointTask.java     |  6 +-
 .../scm/server/SCMDatanodeHeartbeatDispatcher.java |  2 +-
 .../ozone/scm/TestStorageContainerManager.java     | 88 ++++++++++++++++++++++
 3 files changed, 92 insertions(+), 4 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java
index 3a1bd8ffb3..24bf4190ec 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java
@@ -86,7 +86,7 @@ import static org.apache.hadoop.ozone.container.upgrade.UpgradeUtils.toLayoutVer
  */
 public class HeartbeatEndpointTask
     implements Callable<EndpointStateMachine.EndPointStates> {
-  static final Logger LOG =
+  public static final Logger LOG =
       LoggerFactory.getLogger(HeartbeatEndpointTask.class);
   private final EndpointStateMachine rpcEndpoint;
   private final ConfigurationSource conf;
@@ -450,9 +450,9 @@ public class HeartbeatEndpointTask
     if (rpcEndpoint.getState() == EndPointStates.HEARTBEAT) {
       if (LOG.isDebugEnabled()) {
         LOG.debug("Received SCM notification to register."
-            + " Interrupt HEARTBEAT and transit to REGISTER state.");
+            + " Interrupt HEARTBEAT and transit to GETVERSION state.");
       }
-      rpcEndpoint.setState(EndPointStates.REGISTER);
+      rpcEndpoint.setState(EndPointStates.GETVERSION);
     } else {
       if (LOG.isDebugEnabled()) {
         LOG.debug("Illegal state {} found, expecting {}.",
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java
index 6d9f42c49b..aaadbbbcb9 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java
@@ -72,7 +72,7 @@ import static org.apache.hadoop.ozone.container.upgrade.UpgradeUtils.toLayoutVer
  */
 public final class SCMDatanodeHeartbeatDispatcher {
 
-  private static final Logger LOG =
+  public static final Logger LOG =
       LoggerFactory.getLogger(SCMDatanodeHeartbeatDispatcher.class);
 
   private final NodeManager nodeManager;
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestStorageContainerManager.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestStorageContainerManager.java
index 2a43eefa00..a652f7327b 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestStorageContainerManager.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestStorageContainerManager.java
@@ -78,6 +78,11 @@ import org.apache.hadoop.ozone.OzoneConsts;
 import org.apache.hadoop.ozone.OzoneTestUtils;
 import org.apache.hadoop.ozone.container.ContainerTestHelper;
 import org.apache.hadoop.ozone.container.common.SCMTestUtils;
+import org.apache.hadoop.ozone.HddsDatanodeService;
+import org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine;
+import org.apache.hadoop.ozone.container.common.statemachine.EndpointStateMachine;
+import org.apache.hadoop.ozone.container.common.states.endpoint.HeartbeatEndpointTask;
+import org.apache.hadoop.ozone.container.common.states.endpoint.VersionEndpointTask;
 import org.apache.hadoop.ozone.om.helpers.OmKeyInfo;
 import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
 import org.apache.hadoop.ozone.protocol.commands.CloseContainerCommand;
@@ -87,7 +92,10 @@ import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
 import org.apache.hadoop.ozone.upgrade.LayoutVersionManager;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.authentication.client.AuthenticationException;
+import org.apache.hadoop.util.ExitUtil;
 import org.apache.hadoop.util.Time;
+import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
 import org.apache.ozone.test.GenericTestUtils;
 import org.apache.ratis.conf.RaftProperties;
 import org.apache.ratis.protocol.RaftGroupId;
@@ -370,6 +378,86 @@ public class TestStorageContainerManager {
     }
   }
 
+  @Test
+  public void testOldDNRegistersToReInitialisedSCM() throws Exception {
+    OzoneConfiguration conf = new OzoneConfiguration();
+    MiniOzoneCluster cluster =
+        MiniOzoneCluster.newBuilder(conf).setHbInterval(1000)
+            .setHbProcessorInterval(3000).setNumDatanodes(1)
+            .setClusterId(UUID.randomUUID().toString()).build();
+    cluster.waitForClusterToBeReady();
+
+    try {
+      HddsDatanodeService datanode = cluster.getHddsDatanodes().get(0);
+      StorageContainerManager scm = cluster.getStorageContainerManager();
+      scm.stop();
+
+      // re-initialise SCM with new clusterID
+
+      GenericTestUtils.deleteDirectory(
+          new File(scm.getScmStorageConfig().getStorageDir()));
+      String newClusterId = UUID.randomUUID().toString();
+      StorageContainerManager.scmInit(scm.getConfiguration(), newClusterId);
+      scm = HddsTestUtils.getScmSimple(scm.getConfiguration());
+
+      DatanodeStateMachine dsm = datanode.getDatanodeStateMachine();
+      Assert.assertEquals(DatanodeStateMachine.DatanodeStates.RUNNING,
+          dsm.getContext().getState());
+      // DN Endpoint State has already gone through GetVersion and Register,
+      // so it will be in HEARTBEAT state.
+      for (EndpointStateMachine endpoint : dsm.getConnectionManager()
+          .getValues()) {
+        Assert.assertEquals(EndpointStateMachine.EndPointStates.HEARTBEAT,
+            endpoint.getState());
+      }
+      GenericTestUtils.LogCapturer scmDnHBDispatcherLog =
+          GenericTestUtils.LogCapturer.captureLogs(
+              SCMDatanodeHeartbeatDispatcher.LOG);
+      LogManager.getLogger(HeartbeatEndpointTask.class).setLevel(Level.DEBUG);
+      GenericTestUtils.LogCapturer heartbeatEndpointTaskLog =
+          GenericTestUtils.LogCapturer.captureLogs(HeartbeatEndpointTask.LOG);
+      GenericTestUtils.LogCapturer versionEndPointTaskLog =
+          GenericTestUtils.LogCapturer.captureLogs(VersionEndpointTask.LOG);
+      // Initially empty
+      Assert.assertTrue(scmDnHBDispatcherLog.getOutput().isEmpty());
+      Assert.assertTrue(versionEndPointTaskLog.getOutput().isEmpty());
+      // start the new SCM
+      scm.start();
+      // Initially DatanodeStateMachine will be in Running state
+      Assert.assertEquals(DatanodeStateMachine.DatanodeStates.RUNNING,
+          dsm.getContext().getState());
+      // DN heartbeats to new SCM, SCM doesn't recognize the node, sends the
+      // command to DN to re-register. Wait for SCM to send re-register command
+      String expectedLog = String.format(
+          "SCM received heartbeat from an unregistered datanode %s. "
+              + "Asking datanode to re-register.",
+          datanode.getDatanodeDetails());
+      GenericTestUtils.waitFor(
+          () -> scmDnHBDispatcherLog.getOutput().contains(expectedLog), 100,
+          5000);
+      ExitUtil.disableSystemExit();
+      // As part of processing response for re-register, DN EndpointStateMachine
+      // goes to GET-VERSION state which checks if there is already existing
+      // version file on the DN & if the clusterID matches with that of the SCM
+      // In this case, it won't match and gets InconsistentStorageStateException
+      // and DN shuts down.
+      String expectedLog2 = "Received SCM notification to register."
+          + " Interrupt HEARTBEAT and transit to GETVERSION state.";
+      GenericTestUtils.waitFor(
+          () -> heartbeatEndpointTaskLog.getOutput().contains(expectedLog2),
+          100, 5000);
+      GenericTestUtils.waitFor(() -> dsm.getContext().getShutdownOnError(), 100,
+          5000);
+      Assert.assertEquals(DatanodeStateMachine.DatanodeStates.SHUTDOWN,
+          dsm.getContext().getState());
+      Assert.assertTrue(versionEndPointTaskLog.getOutput().contains(
+          "org.apache.hadoop.ozone.common" +
+              ".InconsistentStorageStateException: Mismatched ClusterIDs"));
+    } finally {
+      cluster.shutdown();
+    }
+  }
+
   @Test
   public void testBlockDeletingThrottling() throws Exception {
     int numKeys = 15;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to