This is an automated email from the ASF dual-hosted git repository.

bharat pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 70ef475  HDDS-5233. SCM subsequent init failed when previous scm init 
failed. (#2249)
70ef475 is described below

commit 70ef47534c440965da2d179cc7dd4557bc79cc5a
Author: Bharat Viswanadham <[email protected]>
AuthorDate: Fri May 21 09:27:06 2021 +0530

    HDDS-5233. SCM subsequent init failed when previous scm init failed. (#2249)
---
 .../hadoop/hdds/scm/ha/SCMRatisServerImpl.java     | 14 +++++++----
 .../apache/hadoop/hdds/scm/ha/SCMStateMachine.java | 19 +++++---------
 .../hdds/scm/server/StorageContainerManager.java   | 29 +++++++++++++++++-----
 .../hadoop/ozone/TestStorageContainerManager.java  | 25 +++++++++++++++++++
 4 files changed, 63 insertions(+), 24 deletions(-)

diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
index ea563f2..063c007 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
@@ -82,7 +82,6 @@ public class SCMRatisServerImpl implements SCMRatisServer {
       final StorageContainerManager scm, final SCMHADBTransactionBuffer buffer)
       throws IOException {
     this.scm = scm;
-    this.stateMachine = new SCMStateMachine(scm, this, buffer);
     final RaftGroupId groupId = buildRaftGroupId(scm.getClusterId());
     LOG.info("starting Raft server for scm:{}", scm.getScmId());
     // During SCM startup, the bootstrapped node will be started just with
@@ -100,9 +99,13 @@ public class SCMRatisServerImpl implements SCMRatisServer {
     Parameters parameters = createSCMServerTlsParameters(grpcTlsConfig);
 
     this.server = newRaftServer(scm.getScmId(), conf)
-        .setStateMachine(stateMachine)
+        .setStateMachineRegistry((gId) -> new SCMStateMachine(scm, buffer))
         .setGroup(RaftGroup.valueOf(groupId))
         .setParameters(parameters).build();
+
+    this.stateMachine =
+        (SCMStateMachine) server.getDivision(groupId).getStateMachine();
+
     this.division = server.getDivision(groupId);
   }
 
@@ -111,7 +114,9 @@ public class SCMRatisServerImpl implements SCMRatisServer {
     final RaftGroup group = buildRaftGroup(details, scmId, clusterId);
     RaftServer server = null;
     try {
-      server = newRaftServer(scmId, conf).setGroup(group).build();
+      server = newRaftServer(scmId, conf).setGroup(group)
+              .setStateMachineRegistry((groupId -> new SCMStateMachine()))
+              .build();
       server.start();
       waitForLeaderToBeReady(server, conf, group);
     } finally {
@@ -158,8 +163,7 @@ public class SCMRatisServerImpl implements SCMRatisServer {
     final RaftProperties serverProperties =
         RatisUtil.newRaftProperties(haConf, conf);
     return RaftServer.newBuilder().setServerId(RaftPeerId.getRaftPeerId(scmId))
-        .setProperties(serverProperties)
-        .setStateMachine(new SCMStateMachine());
+        .setProperties(serverProperties);
   }
 
   @Override
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
index 968b6f6..6fdbc5d 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
@@ -37,7 +37,6 @@ import org.apache.hadoop.hdds.utils.TransactionInfo;
 import org.apache.hadoop.hdds.utils.db.DBCheckpoint;
 import org.apache.hadoop.util.concurrent.HadoopExecutors;
 import org.apache.ratis.proto.RaftProtos;
-import org.apache.hadoop.hdds.scm.exceptions.SCMException;
 import org.apache.hadoop.util.Time;
 import org.apache.ratis.protocol.Message;
 import org.apache.ratis.protocol.RaftGroupId;
@@ -59,8 +58,6 @@ import org.apache.ratis.util.LifeCycle;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static 
org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes.SCM_NOT_INITIALIZED;
-
 /**
  * The SCMStateMachine is the state machine for SCMRatisServer. It is
  * responsible for applying ratis committed transactions to
@@ -84,20 +81,16 @@ public class SCMStateMachine extends BaseStateMachine {
   private DBCheckpoint installingDBCheckpoint = null;
 
   public SCMStateMachine(final StorageContainerManager scm,
-      final SCMRatisServer ratisServer, SCMHADBTransactionBuffer buffer)
-      throws SCMException {
+      SCMHADBTransactionBuffer buffer) {
     this.scm = scm;
     this.handlers = new EnumMap<>(RequestType.class);
     this.transactionBuffer = buffer;
     TransactionInfo latestTrxInfo = this.transactionBuffer.getLatestTrxInfo();
-    if (!latestTrxInfo.isDefault() &&
-        !updateLastAppliedTermIndex(latestTrxInfo.getTerm(),
-            latestTrxInfo.getTransactionIndex())) {
-      throw new SCMException(
-          String.format("Failed to update LastAppliedTermIndex " +
-                  "in StateMachine to term:{} index:{}",
-              latestTrxInfo.getTerm(), latestTrxInfo.getTransactionIndex()
-          ), SCM_NOT_INITIALIZED);
+    if (!latestTrxInfo.isDefault()) {
+      updateLastAppliedTermIndex(latestTrxInfo.getTerm(),
+          latestTrxInfo.getTransactionIndex());
+      LOG.info("Updated lastAppliedTermIndex {} with transactionInfo term and" 
+
+          "Index", latestTrxInfo);
     }
     this.installSnapshotExecutor = HadoopExecutors.newSingleThreadExecutor();
     isInitialized = true;
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
index fdfc4ec..09529d5 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
@@ -946,19 +946,36 @@ public final class StorageContainerManager extends 
ServiceRuntimeInfoImpl
           scmStorageConfig.setClusterId(clusterId);
         }
 
-        if (SCMHAUtils.isSCMHAEnabled(conf)) {
-          SCMRatisServerImpl.initialize(scmStorageConfig.getClusterID(),
-              scmStorageConfig.getScmId(), haDetails.getLocalNodeDetails(),
-              conf);
-          scmStorageConfig.setSCMHAFlag(true);
-        }
 
         if (OzoneSecurityUtil.isSecurityEnabled(conf)) {
           HASecurityUtils.initializeSecurity(scmStorageConfig, conf,
               getScmAddress(haDetails, conf), true);
         }
+
+        // Ensure scmRatisServer#initialize() is called post scm storage
+        // config initialization.. If SCM version file is created,
+        // the subsequent scm init should use the clusterID from version file.
+        // So, scmStorageConfig#initialize() should happen before ratis server
+        // initialize. In this way,we do not leave ratis storage directory
+        // with multiple raft group directories in failure scenario.
+
+        // The order of init should be
+        // 1. SCM storage config initialize to create version file.
+        // 2. Initialize Ratis server.
+
         scmStorageConfig.setPrimaryScmNodeId(scmStorageConfig.getScmId());
         scmStorageConfig.initialize();
+
+        if (SCMHAUtils.isSCMHAEnabled(conf)) {
+          SCMRatisServerImpl.initialize(scmStorageConfig.getClusterID(),
+              scmStorageConfig.getScmId(), haDetails.getLocalNodeDetails(),
+              conf);
+          scmStorageConfig = new SCMStorageConfig(conf);
+          scmStorageConfig.setSCMHAFlag(true);
+          // Do force initialize to persist SCM_HA flag.
+          scmStorageConfig.forceInitialize();
+        }
+
         LOG.info("SCM initialization succeeded. Current cluster id for sd={}"
                 + "; cid={}; layoutVersion={}; scmId={}",
             scmStorageConfig.getStorageDir(), scmStorageConfig.getClusterID(),
diff --git 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/TestStorageContainerManager.java
 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/TestStorageContainerManager.java
index 6670428..f91fc66 100644
--- 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/TestStorageContainerManager.java
+++ 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/TestStorageContainerManager.java
@@ -529,6 +529,31 @@ public class TestStorageContainerManager {
     }
   }
 
+
+  @Test
+  public void testSCMReinitializationWithHAUpgrade() throws Exception {
+    OzoneConfiguration conf = new OzoneConfiguration();
+    final String path = GenericTestUtils.getTempPath(
+        UUID.randomUUID().toString());
+    Path scmPath = Paths.get(path, "scm-meta");
+    conf.set(HddsConfigKeys.OZONE_METADATA_DIRS, scmPath.toString());
+    //This will set the cluster id in the version file
+    final UUID clusterId = UUID.randomUUID();
+      // This will initialize SCM
+
+    StorageContainerManager.scmInit(conf, clusterId.toString());
+    SCMStorageConfig scmStore = new SCMStorageConfig(conf);
+    Assert.assertEquals(clusterId.toString(), scmStore.getClusterID());
+    Assert.assertFalse(scmStore.isSCMHAEnabled());
+
+    conf.setBoolean(ScmConfigKeys.OZONE_SCM_HA_ENABLE_KEY, true);
+    StorageContainerManager.scmInit(conf, clusterId.toString());
+    scmStore = new SCMStorageConfig(conf);
+    Assert.assertTrue(scmStore.isSCMHAEnabled());
+    validateRatisGroupExists(conf, clusterId.toString());
+
+  }
+
   @VisibleForTesting
   public static void validateRatisGroupExists(OzoneConfiguration conf,
       String clusterId) throws IOException {

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to