This is an automated email from the ASF dual-hosted git repository.

bharat pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 6752a71  HDDS-5317. BootStrapped SCM fails to bootstrap if it connects 
to another bootstrapped SCM first. (#2312)
6752a71 is described below

commit 6752a71eb0bf3083f2b2026a507e0320b8a7f39c
Author: Bharat Viswanadham <[email protected]>
AuthorDate: Thu Jun 10 09:43:01 2021 +0530

    HDDS-5317. BootStrapped SCM fails to bootstrap if it connects to another 
bootstrapped SCM first. (#2312)
---
 .../scm/ha/RetriableWithFailOverException.java     | 31 ++++++++++++++++++++++
 .../org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java  |  2 ++
 .../apache/hadoop/hdds/scm/ha/HASecurityUtils.java |  2 +-
 .../org/apache/hadoop/hdds/scm/ha/RatisUtil.java   | 22 +++++++++++++++
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/RetriableWithFailOverException.java
 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/RetriableWithFailOverException.java
new file mode 100644
index 0000000..4eaea72
--- /dev/null
+++ 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/RetriableWithFailOverException.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.ha;
+
+import java.io.IOException;
+
+/**
+ * This exception indicates that the request can be retried, and the client
+ * needs to retry on the next server.
+ */
+public class RetriableWithFailOverException extends IOException {
+  public RetriableWithFailOverException(IOException exception) {
+    super(exception);
+  }
+}
diff --git 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
index 0d182c0..7220d53 100644
--- 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
+++ 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAUtils.java
@@ -320,6 +320,8 @@ public final class SCMHAUtils {
     } else if (SCMHAUtils.checkNonRetriableException(e)) {
       return RetryPolicy.RetryAction.FAIL;
     } else {
+      // For any other exception, such as RetriableWithFailOverException,
+      // we perform fail-over and retry.
       if (failovers < maxRetryCount) {
         return new RetryPolicy.RetryAction(
             RetryPolicy.RetryAction.RetryDecision.FAILOVER_AND_RETRY,
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/HASecurityUtils.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/HASecurityUtils.java
index 1d2b36e..a314057 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/HASecurityUtils.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/HASecurityUtils.java
@@ -357,7 +357,7 @@ public final class HASecurityUtils {
         .setLeaderId(null)
         .setProperties(properties)
         .setRetryPolicy(
-            RetryPolicies.retryUpToMaximumCountWithFixedSleep(15,
+            RetryPolicies.retryUpToMaximumCountWithFixedSleep(120,
                 TimeDuration.valueOf(500, TimeUnit.MILLISECONDS)));
 
     if (tlsConfig != null) {
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java
index 16394e6..63545dd 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/RatisUtil.java
@@ -22,6 +22,7 @@ import com.google.protobuf.ServiceException;
 import org.apache.hadoop.hdds.conf.ConfigurationSource;
 import org.apache.hadoop.hdds.ratis.ServerNotLeaderException;
 import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.security.exception.SCMSecurityException;
 import org.apache.hadoop.hdds.server.ServerUtils;
 import org.apache.ratis.RaftConfigKeys;
 import org.apache.ratis.conf.RaftProperties;
@@ -37,6 +38,10 @@ import java.io.IOException;
 import java.util.Collections;
 import java.util.concurrent.TimeUnit;
 
+import static 
org.apache.hadoop.hdds.security.exception.SCMSecurityException.ErrorCode.GET_DN_CERTIFICATE_FAILED;
+import static 
org.apache.hadoop.hdds.security.exception.SCMSecurityException.ErrorCode.GET_OM_CERTIFICATE_FAILED;
+import static 
org.apache.hadoop.hdds.security.exception.SCMSecurityException.ErrorCode.GET_SCM_CERTIFICATE_FAILED;
+import static 
org.apache.hadoop.hdds.security.exception.SCMSecurityException.ErrorCode.NOT_A_PRIMARY_SCM;
 import static org.apache.ratis.server.RaftServerConfigKeys.Log;
 import static org.apache.ratis.server.RaftServerConfigKeys.RetryCache;
 import static org.apache.ratis.server.RaftServerConfigKeys.Rpc;
@@ -190,6 +195,23 @@ public final class RatisUtil {
       throw new ServiceException(ServerNotLeaderException
           .convertToNotLeaderException(nle,
               SCMRatisServerImpl.getSelfPeerId(scmId), port));
+    } else if (e instanceof SCMSecurityException) {
+      // For the NOT_A_PRIMARY_SCM error the client needs to retry on the
+      // next SCM. A GetSCMCertificate call can happen on a non-leader SCM
+      // but is served only by the primary SCM. When a bootstrapping SCM
+      // connects to another bootstrapped SCM we get NOT_A_PRIMARY_SCM. In
+      // this scenario the client needs to retry the next SCM.
+
+      // Also, on the primary/leader SCM, if the call failed for any other
+      // reason, retry again without failing over.
+      SCMSecurityException ex = (SCMSecurityException) e;
+      if (ex.getErrorCode().equals(NOT_A_PRIMARY_SCM)) {
+        throw new ServiceException(new RetriableWithFailOverException(e));
+      } else if (ex.getErrorCode().equals(GET_SCM_CERTIFICATE_FAILED) ||
+          ex.getErrorCode().equals(GET_OM_CERTIFICATE_FAILED) ||
+          ex.getErrorCode().equals(GET_DN_CERTIFICATE_FAILED)) {
+        throw new ServiceException(new RetriableWithNoFailoverException(e));
+      }
     }
   }
 }

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to