This is an automated email from the ASF dual-hosted git repository.

siyao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new ff9e37d1bf HDDS-9112. Make snapshot dir wait poll max timeout 
configurable (#5139)
ff9e37d1bf is described below

commit ff9e37d1bf5124ea2e1fb1c1edd468efee89acd9
Author: Swaminathan Balachandran <[email protected]>
AuthorDate: Thu Aug 3 11:09:01 2023 -0700

    HDDS-9112. Make snapshot dir wait poll max timeout configurable (#5139)
    
    Co-authored-by: Hemant Kumar <[email protected]>
    Co-authored-by: prashantpogde <[email protected]>
---
 .../common/src/main/resources/ozone-default.xml    |  9 +++++++
 .../hadoop/hdds/utils/db/RDBCheckpointUtils.java   | 30 +++++++++++++++-------
 .../org/apache/hadoop/ozone/om/OMConfigKeys.java   |  6 +++++
 .../hadoop/ozone/om/OmMetadataManagerImpl.java     | 11 +++++++-
 .../hadoop/ozone/om/snapshot/SnapshotUtils.java    |  6 +++--
 5 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml 
b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index d392fe97f6..aa1b092413 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -3368,6 +3368,15 @@
     <description>Time interval of the SST File filtering service from Snapshot.
     </description>
   </property>
+  <property>
+    <name>ozone.om.snapshot.checkpoint.dir.creation.poll.timeout</name>
+    <value>20s</value>
+    <tag>OZONE, PERFORMANCE, OM</tag>
+    <description>
+      Max poll timeout for snapshot dir exists check performed before loading a snapshot in cache.
+      Unit defaults to millisecond if a unit is not specified.
+    </description>
+  </property>
   <property>
     <name>ozone.sst.filtering.service.timeout</name>
     <value>300000ms</value>
diff --git 
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/RDBCheckpointUtils.java
 
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/RDBCheckpointUtils.java
index 24033680a7..ee550d7227 100644
--- 
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/RDBCheckpointUtils.java
+++ 
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/db/RDBCheckpointUtils.java
@@ -37,21 +37,22 @@ public final class RDBCheckpointUtils {
       LoggerFactory.getLogger(RDBCheckpointUtils.class);
   private static final Duration POLL_DELAY_DURATION = Duration.ZERO;
   private static final Duration POLL_INTERVAL_DURATION = 
Duration.ofMillis(100);
-  private static final Duration POLL_MAX_DURATION = Duration.ofSeconds(5);
+  private static final Duration POLL_MAX_DURATION = Duration.ofSeconds(20);
 
   private RDBCheckpointUtils() { }
 
   /**
-   * Wait for checkpoint directory to be created for 5 secs with 100 millis
-   * poll interval.
+   * Wait for checkpoint directory to be created for the given duration with
+   * 100 millis poll interval.
    * @param file Checkpoint directory.
-   * @return true if found.
+   * @param maxWaitTimeout wait at most before request timeout.
+   * @return true if found within given timeout else false.
    */
-  public static boolean waitForCheckpointDirectoryExist(File file)
-      throws IOException {
+  public static boolean waitForCheckpointDirectoryExist(File file,
+      Duration maxWaitTimeout) {
     Instant start = Instant.now();
     try {
-      with().atMost(POLL_MAX_DURATION)
+      with().atMost(maxWaitTimeout)
           .pollDelay(POLL_DELAY_DURATION)
           .pollInterval(POLL_INTERVAL_DURATION)
           .await()
@@ -62,9 +63,20 @@ public final class RDBCheckpointUtils {
           file.getAbsoluteFile());
       return true;
     } catch (ConditionTimeoutException exception) {
-      LOG.info("Checkpoint directory: {} didn't get created in 5 secs.",
-          file.getAbsolutePath());
+      LOG.info("Checkpoint directory: {} didn't get created in {} secs.",
+          file.getAbsolutePath(), maxWaitTimeout.getSeconds());
       return false;
     }
   }
+
+  /**
+   * Wait up to POLL_MAX_DURATION (20 secs) for checkpoint directory to be
+   * created, with 100 millis poll interval.
+   * @param file Checkpoint directory.
+   * @return true if found within the timeout else false.
+   */
+  public static boolean waitForCheckpointDirectoryExist(File file)
+      throws IOException {
+    return waitForCheckpointDirectoryExist(file, POLL_MAX_DURATION);
+  }
 }
diff --git 
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java
 
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java
index 259f7ab3ee..4a2369963d 100644
--- 
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java
+++ 
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java
@@ -383,6 +383,12 @@ public final class OMConfigKeys {
   public static final String
       OZONE_SNAPSHOT_SST_FILTERING_SERVICE_INTERVAL_DEFAULT = "60s";
 
+  public static final String
+      OZONE_SNAPSHOT_CHECKPOINT_DIR_CREATION_POLL_TIMEOUT =
+      "ozone.om.snapshot.checkpoint.dir.creation.poll.timeout";
+
+  public static final String
+      OZONE_SNAPSHOT_CHECKPOINT_DIR_CREATION_POLL_TIMEOUT_DEFAULT = "20s";
 
   public static final String OZONE_OM_GRPC_MAXIMUM_RESPONSE_LENGTH =
       "ozone.om.grpc.maximum.response.length";
diff --git 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java
 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java
index 86f3725408..748c9094d7 100644
--- 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java
+++ 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java
@@ -33,6 +33,7 @@ import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.UUID;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
@@ -97,6 +98,8 @@ import static org.apache.hadoop.ozone.OzoneConsts.OM_DB_NAME;
 import static org.apache.hadoop.ozone.OzoneConsts.OM_KEY_PREFIX;
 import static 
org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_FS_SNAPSHOT_MAX_LIMIT;
 import static 
org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_FS_SNAPSHOT_MAX_LIMIT_DEFAULT;
+import static 
org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_SNAPSHOT_CHECKPOINT_DIR_CREATION_POLL_TIMEOUT;
+import static 
org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_SNAPSHOT_CHECKPOINT_DIR_CREATION_POLL_TIMEOUT_DEFAULT;
 import static org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshotPrefix;
 import static 
org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.BUCKET_NOT_FOUND;
 import static 
org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.VOLUME_NOT_FOUND;
@@ -377,13 +380,19 @@ public class OmMetadataManagerImpl implements 
OMMetadataManager,
           OM_KEY_PREFIX + OM_SNAPSHOT_CHECKPOINT_DIR;
       File metaDir = new File(snapshotDir);
       String dbName = OM_DB_NAME + snapshotDirName;
+      Duration maxPollDuration =
+          Duration.ofMillis(conf.getTimeDuration(
+              OZONE_SNAPSHOT_CHECKPOINT_DIR_CREATION_POLL_TIMEOUT,
+              OZONE_SNAPSHOT_CHECKPOINT_DIR_CREATION_POLL_TIMEOUT_DEFAULT,
+              TimeUnit.MILLISECONDS));
       // The check is only to prevent every snapshot read to perform a disk IO
       // and check if a checkpoint dir exists. If entry is present in cache,
       // it is most likely DB entries will get flushed in this wait time.
       if (isSnapshotInCache) {
         File checkpoint =
             Paths.get(metaDir.toPath().toString(), dbName).toFile();
-        RDBCheckpointUtils.waitForCheckpointDirectoryExist(checkpoint);
+        RDBCheckpointUtils.waitForCheckpointDirectoryExist(checkpoint,
+            maxPollDuration);
         // Check if the snapshot directory exists.
         checkSnapshotDirExist(checkpoint);
       }
diff --git 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java
 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java
index de6ab46361..ef7e4e895e 100644
--- 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java
+++ 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java
@@ -36,6 +36,7 @@ import java.util.UUID;
 
 import static 
org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND;
 import static 
org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.KEY_NOT_FOUND;
+import static 
org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.TIMEOUT;
 
 /**
  * Util class for snapshot diff APIs.
@@ -93,7 +94,7 @@ public final class SnapshotUtils {
   }
 
   /**
-   * Throws OMException FILE_NOT_FOUND if snapshot directory does not exist.
+   * Throws OMException TIMEOUT if snapshot directory does not exist.
    * @param checkpoint Snapshot checkpoint directory
    */
   public static void checkSnapshotDirExist(File checkpoint)
@@ -101,7 +102,8 @@ public final class SnapshotUtils {
     if (!checkpoint.exists()) {
       throw new OMException("Unable to load snapshot. " +
           "Snapshot checkpoint directory '" + checkpoint.getAbsolutePath() +
-          "' does not exists.", FILE_NOT_FOUND);
+          "' does not exist yet. Please wait a few more seconds before " +
+          "retrying", TIMEOUT);
     }
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to