Repository: giraph
Updated Branches:
  refs/heads/trunk 096accfd0 -> beca35e45


GIRAPH-860: Giraph jobs can hang forever if HDFS filestamps aren't
created (aching)


Project: http://git-wip-us.apache.org/repos/asf/giraph/repo
Commit: http://git-wip-us.apache.org/repos/asf/giraph/commit/beca35e4
Tree: http://git-wip-us.apache.org/repos/asf/giraph/tree/beca35e4
Diff: http://git-wip-us.apache.org/repos/asf/giraph/diff/beca35e4

Branch: refs/heads/trunk
Commit: beca35e450c6103bf1020bdc1a13ee7b87624b5d
Parents: 096accf
Author: Avery Ching <[email protected]>
Authored: Mon Feb 24 17:01:50 2014 -0800
Committer: Avery Ching <[email protected]>
Committed: Tue Feb 25 10:58:39 2014 -0800

----------------------------------------------------------------------
 CHANGELOG                                       |  3 ++
 .../apache/giraph/conf/GiraphConfiguration.java | 15 +++++++
 .../org/apache/giraph/conf/GiraphConstants.java | 10 +++++
 .../org/apache/giraph/zk/ZooKeeperManager.java  | 45 ++++++++++++++++----
 4 files changed, 64 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/giraph/blob/beca35e4/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 738f801..7708e72 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,9 @@
 Giraph Change Log
 
 Release 1.1.0 - unreleased
+  GIRAPH-860: Giraph jobs can hang forever if HDFS filestamps aren't 
+  created (aching)
+
   GIRAPH-858: tests fail for hadoop_facebook because of dependency issues
   (pavanka via aching)
 

http://git-wip-us.apache.org/repos/asf/giraph/blob/beca35e4/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConfiguration.java
----------------------------------------------------------------------
diff --git a/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConfiguration.java b/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConfiguration.java
index 358fe44..2862c3e 100644
--- a/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConfiguration.java
+++ b/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConfiguration.java
@@ -1223,4 +1223,19 @@ public class GiraphConfiguration extends Configuration
   public boolean trackJobProgressOnClient() {
     return TRACK_JOB_PROGRESS_ON_CLIENT.get(this);
   }
+
+  /**
+   * @return Number of retries when creating an HDFS file before failing.
+   */
+  public int getHdfsFileCreationRetries() {
+    return HDFS_FILE_CREATION_RETRIES.get(this);
+  }
+
+  /**
+   * @return Milliseconds to wait before retrying an HDFS file creation
+   *         operation.
+   */
+  public int getHdfsFileCreationRetryWaitMs() {
+    return HDFS_FILE_CREATION_RETRY_WAIT_MS.get(this);
+  }
 }

http://git-wip-us.apache.org/repos/asf/giraph/blob/beca35e4/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConstants.java
----------------------------------------------------------------------
diff --git a/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConstants.java b/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConstants.java
index 8afe101..300005e 100644
--- a/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConstants.java
+++ b/giraph-core/src/main/java/org/apache/giraph/conf/GiraphConstants.java
@@ -1031,5 +1031,15 @@ public interface GiraphConstants {
   BooleanConfOption TRACK_JOB_PROGRESS_ON_CLIENT =
       new BooleanConfOption("giraph.trackJobProgressOnClient", true,
           "Whether to track job progress on client or not");
+
+  /** Number of retries for creating the HDFS files */
+  IntConfOption HDFS_FILE_CREATION_RETRIES =
+      new IntConfOption("giraph.hdfs.file.creation.retries", 10,
+          "Retries to create an HDFS file before failing");
+
+  /** Number of milliseconds to wait before retrying HDFS file creation */
+  IntConfOption HDFS_FILE_CREATION_RETRY_WAIT_MS =
+      new IntConfOption("giraph.hdfs.file.creation.retry.wait.ms", 30_000,
+          "Milliseconds to wait prior to retrying creation of an HDFS file");
 }
 // CHECKSTYLE: resume InterfaceIsTypeCheck

http://git-wip-us.apache.org/repos/asf/giraph/blob/beca35e4/giraph-core/src/main/java/org/apache/giraph/zk/ZooKeeperManager.java
----------------------------------------------------------------------
diff --git a/giraph-core/src/main/java/org/apache/giraph/zk/ZooKeeperManager.java b/giraph-core/src/main/java/org/apache/giraph/zk/ZooKeeperManager.java
index 348580c..73ef97b 100644
--- a/giraph-core/src/main/java/org/apache/giraph/zk/ZooKeeperManager.java
+++ b/giraph-core/src/main/java/org/apache/giraph/zk/ZooKeeperManager.java
@@ -21,6 +21,7 @@ package org.apache.giraph.zk;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.io.Closeables;
+import com.google.common.util.concurrent.Uninterruptibles;
 import org.apache.commons.io.FileUtils;
 import org.apache.giraph.conf.GiraphConstants;
 import org.apache.giraph.conf.ImmutableClassesGiraphConfiguration;
@@ -52,6 +53,7 @@ import java.util.Arrays;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.TimeUnit;
 
 import static org.apache.giraph.conf.GiraphConstants.BASE_ZNODE_KEY;
 import static org.apache.giraph.conf.GiraphConstants.ZOOKEEPER_MANAGER_DIRECTORY;
@@ -310,7 +312,7 @@ public class ZooKeeperManager {
               "for base directory " + baseDirectory + ".  If there is an " +
               "issue with this directory, please set an accesible " +
               "base directory with the Hadoop configuration option " +
-              ZOOKEEPER_MANAGER_DIRECTORY.getKey());
+              ZOOKEEPER_MANAGER_DIRECTORY.getKey(), e);
     }
 
     Path myCandidacyPath = new Path(
@@ -329,19 +331,44 @@ public class ZooKeeperManager {
   }
 
   /**
+   * Create a new file with retries if it fails.
+   *
+   * @param fs File system where the new file is created
+   * @param path Path of the new file
+   * @param maxAttempts Maximum number of attempts
+   * @param retryWaitMsecs Milliseconds to wait before retrying
+   */
+  private static void createNewFileWithRetries(
+      FileSystem fs, Path path, int maxAttempts, int retryWaitMsecs) {
+    int attempt = 0;
+    while (attempt < maxAttempts) {
+      try {
+        fs.createNewFile(path);
+        return;
+      } catch (IOException e) {
+        LOG.warn("createNewFileWithRetries: Failed to create file at path " +
+            path + " on attempt " + attempt + " of " + maxAttempts + ".", e);
+      }
+      ++attempt;
+      Uninterruptibles.sleepUninterruptibly(
+          retryWaitMsecs, TimeUnit.MILLISECONDS);
+    }
+    throw new IllegalStateException(
+        "createNewFileWithRetries: Failed to create file at path " +
+            path + " after " + attempt + " attempts");
+  }
+
+  /**
    * Every task must create a stamp to let the ZooKeeper servers know that
    * they can shutdown.  This also lets the task know that it was already
    * completed.
    */
   private void createZooKeeperClosedStamp() {
-    try {
-      LOG.info("createZooKeeperClosedStamp: Creating my filestamp " +
-          myClosedPath);
-      fs.createNewFile(myClosedPath);
-    } catch (IOException e) {
-      LOG.error("createZooKeeperClosedStamp: Failed (maybe previous task " +
-          "failed) to create filestamp " + myClosedPath);
-    }
+    LOG.info("createZooKeeperClosedStamp: Creating my filestamp " +
+        myClosedPath);
+    createNewFileWithRetries(fs, myClosedPath,
+        conf.getHdfsFileCreationRetries(),
+        conf.getHdfsFileCreationRetryWaitMs());
   }
 
   /**

Reply via email to