This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 85a1e4d3ad HDDS-6634. Datanode should start from GetVersion state when
re-registering to SCM (#5266)
85a1e4d3ad is described below
commit 85a1e4d3ad6f2e0088bf6124965fd131ba803a5b
Author: Sadanand Shenoy <[email protected]>
AuthorDate: Sat Sep 23 13:02:02 2023 +0530
HDDS-6634. Datanode should start from GetVersion state when re-registering
to SCM (#5266)
---
.../states/endpoint/HeartbeatEndpointTask.java | 6 +-
.../scm/server/SCMDatanodeHeartbeatDispatcher.java | 2 +-
.../ozone/scm/TestStorageContainerManager.java | 88 ++++++++++++++++++++++
3 files changed, 92 insertions(+), 4 deletions(-)
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java
index 3a1bd8ffb3..24bf4190ec 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java
@@ -86,7 +86,7 @@ import static org.apache.hadoop.ozone.container.upgrade.UpgradeUtils.toLayoutVer
*/
public class HeartbeatEndpointTask
implements Callable<EndpointStateMachine.EndPointStates> {
- static final Logger LOG =
+ public static final Logger LOG =
LoggerFactory.getLogger(HeartbeatEndpointTask.class);
private final EndpointStateMachine rpcEndpoint;
private final ConfigurationSource conf;
@@ -450,9 +450,9 @@ public class HeartbeatEndpointTask
if (rpcEndpoint.getState() == EndPointStates.HEARTBEAT) {
if (LOG.isDebugEnabled()) {
LOG.debug("Received SCM notification to register."
- + " Interrupt HEARTBEAT and transit to REGISTER state.");
+ + " Interrupt HEARTBEAT and transit to GETVERSION state.");
}
- rpcEndpoint.setState(EndPointStates.REGISTER);
+ rpcEndpoint.setState(EndPointStates.GETVERSION);
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("Illegal state {} found, expecting {}.",
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java
index 6d9f42c49b..aaadbbbcb9 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java
@@ -72,7 +72,7 @@ import static org.apache.hadoop.ozone.container.upgrade.UpgradeUtils.toLayoutVer
*/
public final class SCMDatanodeHeartbeatDispatcher {
- private static final Logger LOG =
+ public static final Logger LOG =
LoggerFactory.getLogger(SCMDatanodeHeartbeatDispatcher.class);
private final NodeManager nodeManager;
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestStorageContainerManager.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestStorageContainerManager.java
index 2a43eefa00..a652f7327b 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestStorageContainerManager.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestStorageContainerManager.java
@@ -78,6 +78,11 @@ import org.apache.hadoop.ozone.OzoneConsts;
import org.apache.hadoop.ozone.OzoneTestUtils;
import org.apache.hadoop.ozone.container.ContainerTestHelper;
import org.apache.hadoop.ozone.container.common.SCMTestUtils;
+import org.apache.hadoop.ozone.HddsDatanodeService;
+import org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine;
+import org.apache.hadoop.ozone.container.common.statemachine.EndpointStateMachine;
+import org.apache.hadoop.ozone.container.common.states.endpoint.HeartbeatEndpointTask;
+import org.apache.hadoop.ozone.container.common.states.endpoint.VersionEndpointTask;
import org.apache.hadoop.ozone.om.helpers.OmKeyInfo;
import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
import org.apache.hadoop.ozone.protocol.commands.CloseContainerCommand;
@@ -87,7 +92,10 @@ import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
import org.apache.hadoop.ozone.upgrade.LayoutVersionManager;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authentication.client.AuthenticationException;
+import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.Time;
+import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
import org.apache.ozone.test.GenericTestUtils;
import org.apache.ratis.conf.RaftProperties;
import org.apache.ratis.protocol.RaftGroupId;
@@ -370,6 +378,86 @@ public class TestStorageContainerManager {
}
}
+ @Test
+ public void testOldDNRegistersToReInitialisedSCM() throws Exception {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ MiniOzoneCluster cluster =
+ MiniOzoneCluster.newBuilder(conf).setHbInterval(1000)
+ .setHbProcessorInterval(3000).setNumDatanodes(1)
+ .setClusterId(UUID.randomUUID().toString()).build();
+ cluster.waitForClusterToBeReady();
+
+ try {
+ HddsDatanodeService datanode = cluster.getHddsDatanodes().get(0);
+ StorageContainerManager scm = cluster.getStorageContainerManager();
+ scm.stop();
+
+ // re-initialise SCM with new clusterID
+
+ GenericTestUtils.deleteDirectory(
+ new File(scm.getScmStorageConfig().getStorageDir()));
+ String newClusterId = UUID.randomUUID().toString();
+ StorageContainerManager.scmInit(scm.getConfiguration(), newClusterId);
+ scm = HddsTestUtils.getScmSimple(scm.getConfiguration());
+
+ DatanodeStateMachine dsm = datanode.getDatanodeStateMachine();
+ Assert.assertEquals(DatanodeStateMachine.DatanodeStates.RUNNING,
+ dsm.getContext().getState());
+ // DN Endpoint State has already gone through GetVersion and Register,
+ // so it will be in HEARTBEAT state.
+ for (EndpointStateMachine endpoint : dsm.getConnectionManager()
+ .getValues()) {
+ Assert.assertEquals(EndpointStateMachine.EndPointStates.HEARTBEAT,
+ endpoint.getState());
+ }
+ GenericTestUtils.LogCapturer scmDnHBDispatcherLog =
+ GenericTestUtils.LogCapturer.captureLogs(
+ SCMDatanodeHeartbeatDispatcher.LOG);
+ LogManager.getLogger(HeartbeatEndpointTask.class).setLevel(Level.DEBUG);
+ GenericTestUtils.LogCapturer heartbeatEndpointTaskLog =
+ GenericTestUtils.LogCapturer.captureLogs(HeartbeatEndpointTask.LOG);
+ GenericTestUtils.LogCapturer versionEndPointTaskLog =
+ GenericTestUtils.LogCapturer.captureLogs(VersionEndpointTask.LOG);
+ // Initially empty
+ Assert.assertTrue(scmDnHBDispatcherLog.getOutput().isEmpty());
+ Assert.assertTrue(versionEndPointTaskLog.getOutput().isEmpty());
+ // start the new SCM
+ scm.start();
+ // Initially DatanodeStateMachine will be in Running state
+ Assert.assertEquals(DatanodeStateMachine.DatanodeStates.RUNNING,
+ dsm.getContext().getState());
+ // DN heartbeats to new SCM, SCM doesn't recognize the node, sends the
+ // command to DN to re-register. Wait for SCM to send re-register command
+ String expectedLog = String.format(
+ "SCM received heartbeat from an unregistered datanode %s. "
+ + "Asking datanode to re-register.",
+ datanode.getDatanodeDetails());
+ GenericTestUtils.waitFor(
+ () -> scmDnHBDispatcherLog.getOutput().contains(expectedLog), 100,
+ 5000);
+ ExitUtil.disableSystemExit();
+ // As part of processing response for re-register, DN EndpointStateMachine
+ // goes to GET-VERSION state which checks if there is already existing
+ // version file on the DN & if the clusterID matches with that of the SCM
+ // In this case, it won't match and gets InconsistentStorageStateException
+ // and DN shuts down.
+ String expectedLog2 = "Received SCM notification to register."
+ + " Interrupt HEARTBEAT and transit to GETVERSION state.";
+ GenericTestUtils.waitFor(
+ () -> heartbeatEndpointTaskLog.getOutput().contains(expectedLog2),
+ 100, 5000);
+ GenericTestUtils.waitFor(() -> dsm.getContext().getShutdownOnError(), 100,
+ 5000);
+ Assert.assertEquals(DatanodeStateMachine.DatanodeStates.SHUTDOWN,
+ dsm.getContext().getState());
+ Assert.assertTrue(versionEndPointTaskLog.getOutput().contains(
+ "org.apache.hadoop.ozone.common" +
+ ".InconsistentStorageStateException: Mismatched ClusterIDs"));
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
@Test
public void testBlockDeletingThrottling() throws Exception {
int numKeys = 15;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]