This is an automated email from the ASF dual-hosted git repository.

msingh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 08375d7  HDDS-5246. Wait for ever to obtain CA list which is needed 
during OM/DN startup (#2266)
08375d7 is described below

commit 08375d7bc3911ca019aef1e8202708d4f74f864d
Author: Bharat Viswanadham <[email protected]>
AuthorDate: Thu May 20 09:20:29 2021 +0530

    HDDS-5246. Wait for ever to obtain CA list which is needed during OM/DN 
startup (#2266)
---
 .../org/apache/hadoop/hdds/scm/ScmConfigKeys.java  |  4 ++
 .../common/src/main/resources/ozone-default.xml    | 14 ++++
 .../java/org/apache/hadoop/hdds/utils/HAUtils.java | 83 +++++++++++-----------
 3 files changed, 59 insertions(+), 42 deletions(-)

diff --git 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
index d6ba48d..024953d 100644
--- 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
+++ 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
@@ -524,6 +524,10 @@ public final class ScmConfigKeys {
   public static final long OZONE_SCM_INFO_WAIT_DURATION_DEFAULT =
       10 * 60;
 
+  public static final String OZONE_SCM_CA_LIST_RETRY_INTERVAL =
+      "ozone.scm.ca.list.retry.interval";
+  public static final long OZONE_SCM_CA_LIST_RETRY_INTERVAL_DEFAULT = 10;
+
   /**
    * Never constructed.
    */
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml 
b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index d64a212..2014aea 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -2841,4 +2841,18 @@
     </description>
   </property>
 
+
+  <property>
+    <name>ozone.scm.ca.list.retry.interval</name>
+    <tag>OZONE, SCM, OM, DATANODE</tag>
+    <value>10s</value>
+    <description>SCM client wait duration between each retry to get Scm CA
+      list. OM/Datanode obtain CA list during startup, and wait
+      for the CA List size to be matched with SCM node count size plus
+      1. (Additional one certificate is root CA certificate). If the received
+      CA list size is not matching with expected count, this is the duration
+      used to wait before making next attempt to get CA list.
+    </description>
+  </property>
+
 </configuration>
diff --git 
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java 
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
index f2cbc4e..4929946 100644
--- 
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
+++ 
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
@@ -43,9 +43,10 @@ import org.apache.hadoop.hdds.utils.db.DBStore;
 import org.apache.hadoop.hdds.utils.db.RocksDBConfiguration;
 import org.apache.hadoop.hdds.utils.db.Table;
 import org.apache.hadoop.hdds.utils.db.DBStoreBuilder;
+import org.apache.hadoop.io.retry.RetryPolicies;
+import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.ozone.OzoneSecurityUtil;
 import org.apache.hadoop.security.UserGroupInformation;
-import org.apache.hadoop.util.Time;
 import org.apache.ratis.util.ExitUtils;
 import org.apache.ratis.util.FileUtils;
 import org.slf4j.Logger;
@@ -61,8 +62,11 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
+import java.util.concurrent.Callable;
 import java.util.concurrent.TimeUnit;
 
+import static 
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_CA_LIST_RETRY_INTERVAL;
+import static 
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_CA_LIST_RETRY_INTERVAL_DEFAULT;
 import static 
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_INFO_WAIT_DURATION;
 import static 
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_INFO_WAIT_DURATION_DEFAULT;
 import static org.apache.hadoop.hdds.server.ServerUtils.getOzoneMetaDirPath;
@@ -355,8 +359,10 @@ public final class HAUtils {
    */
   public static List<String> buildCAList(CertificateClient certClient,
       ConfigurationSource configuration) throws IOException {
-    //TODO: make it configurable.
     List<String> caCertPemList;
+    long waitDuration =
+        configuration.getTimeDuration(OZONE_SCM_CA_LIST_RETRY_INTERVAL,
+            OZONE_SCM_CA_LIST_RETRY_INTERVAL_DEFAULT, TimeUnit.SECONDS);
     if (certClient != null) {
       caCertPemList = new ArrayList<>();
       if (!SCMHAUtils.isSCMHAEnabled(configuration)) {
@@ -377,9 +383,9 @@ public final class HAUtils {
           if (caCertPemList != null && caCertPemList.size() == expectedCount) {
             return caCertPemList;
           }
-          caCertPemList = waitForCACerts(() -> certClient.updateCAList(),
-              expectedCount);
-          checkCertCount(caCertPemList.size(), expectedCount);
+          caCertPemList = getCAListWithRetry(() ->
+              waitForCACerts(certClient::updateCAList, expectedCount),
+              waitDuration);
         } else {
           caCertPemList = certClient.listCA();
         }
@@ -401,10 +407,9 @@ public final class HAUtils {
         Collection<String> scmNodes = SCMHAUtils.getSCMNodeIds(configuration);
         int expectedCount = scmNodes.size() + 1;
         if (scmNodes.size() > 1) {
-          caCertPemList = waitForCACerts(
-              () -> scmSecurityProtocolClient.listCACertificate(),
-              expectedCount);
-          checkCertCount(caCertPemList.size(), expectedCount);
+          caCertPemList = getCAListWithRetry(() -> waitForCACerts(
+              scmSecurityProtocolClient::listCACertificate,
+              expectedCount), waitDuration);
         } else{
           caCertPemList = scmSecurityProtocolClient.listCACertificate();
         }
@@ -413,47 +418,41 @@ public final class HAUtils {
     return caCertPemList;
   }
 
+  /**
+   * Retry for ever until CA list matches expected count.
+   * @param task - task to get CA list.
+   * @return CA list.
+   */
+  private static List<String> getCAListWithRetry(Callable<List<String>> task,
+      long waitDuration) throws IOException {
+    RetryPolicy retryPolicy = RetryPolicies.retryForeverWithFixedSleep(
+        waitDuration, TimeUnit.SECONDS);
+    RetriableTask<List<String>> retriableTask =
+        new RetriableTask<>(retryPolicy, "getCAList", task);
+    try {
+      return retriableTask.call();
+    } catch (Exception ex) {
+      throw new SCMSecurityException("Unable to obtain complete CA " +
+          "list", ex);
+    }
+  }
+
   private static List<String> waitForCACerts(
       final SupplierWithIOException<List<String>> applyFunction,
       int expectedCount) throws IOException {
-    //TODO: make wait time and sleep time configurable if needed.
     // TODO: If SCMs are bootstrapped later, then listCA need to be
     //  refetched if listCA size is less than scm ha config node list size.
     // For now when Client of SCM's are started we compare their node list
     // size and ca list size if it is as expected, we return the ca list.
-    boolean caListUpToDate;
-    long waitTime = 5 * 60 * 1000L;
-    long retryTime = 10 * 1000L;
-    long currentTime = Time.monotonicNow();
-    List<String> caCertPemList;
-    do {
-      caCertPemList = applyFunction.get();
-      caListUpToDate =
-          caCertPemList.size() == expectedCount ? true : false;
-      if (!caListUpToDate) {
-        LOG.info("Expected CA list size {}, where as received CA List size " +
-            "{}. Retry to fetch CA List after {} seconds", expectedCount,
-            caCertPemList.size(), retryTime);
-        try {
-          Thread.sleep(retryTime);
-        } catch (InterruptedException ex) {
-          Thread.currentThread().interrupt();
-        }
-      }
-    } while (!caListUpToDate &&
-        Time.monotonicNow() - currentTime < waitTime);
-    return caCertPemList;
-  }
-
-
-  private static void checkCertCount(int certCount, int expectedCount)
-      throws SCMSecurityException{
-    if (certCount != expectedCount) {
-      LOG.error("Unable to obtain CA list for SCM cluster, obtained CA list " +
-              "size is {}, where as expected list size is {}",
-          certCount, expectedCount);
-      throw new SCMSecurityException("Unable to obtain complete CA list");
+    List<String> caCertPemList = applyFunction.get();
+    boolean caListUpToDate = caCertPemList.size() == expectedCount;
+    if (!caListUpToDate) {
+      LOG.info("Expected CA list size {}, where as received CA List size " +
+          "{}.", expectedCount, caCertPemList.size());
+      throw new SCMSecurityException("Expected CA list size " + expectedCount
+          + " is not matching actual count " + caCertPemList.size());
     }
+    return caCertPemList;
   }
 
   /**

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to