Repository: reef
Updated Branches:
  refs/heads/master 1a7c6d1d3 -> 73c28280c


[REEF-1482] Driver does not exit even if all the tasks exit normally

Currently, when all the tasks and all the evaluators have completed,
the driver sometimes still doesn't shut down and hangs forever.
This happens intermittently. With many nodes (e.g. 500 nodes) in IMRU
runs on a YARN cluster, the issue can occur once in every 2 or 3 runs.

The investigation shows there is a potential deadlock in ResourceManagerStatus.
This PR resolves the issue by reducing the scope of the code executed under locks.

With the fix, I have tested 10 times with 500 nodes in the cluster, and the
issue no longer reproduces.

JIRA:
  [REEF-1482](https://issues.apache.org/jira/browse/REEF-1482)

Pull request:
  This closes #1162


Project: http://git-wip-us.apache.org/repos/asf/reef/repo
Commit: http://git-wip-us.apache.org/repos/asf/reef/commit/73c28280
Tree: http://git-wip-us.apache.org/repos/asf/reef/tree/73c28280
Diff: http://git-wip-us.apache.org/repos/asf/reef/diff/73c28280

Branch: refs/heads/master
Commit: 73c28280c518996714a7bcacdd9e687baafaf35d
Parents: 1a7c6d1
Author: Julia Wang <[email protected]>
Authored: Thu Oct 20 19:22:18 2016 -0700
Committer: Mariia Mykhailova <[email protected]>
Committed: Mon Oct 31 11:01:27 2016 -0700

----------------------------------------------------------------------
 .../runtime/common/driver/idle/DriverIdleManager.java   |  2 ++
 .../driver/resourcemanager/ResourceManagerStatus.java   | 12 +++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/reef/blob/73c28280/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java
----------------------------------------------------------------------
diff --git 
a/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java
 
b/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java
index 2792790..13019fd 100644
--- 
a/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java
+++ 
b/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java
@@ -77,6 +77,8 @@ public final class DriverIdleManager {
       isIdle &= idleMessage.isIdle();
     }
 
+    LOG.log(IDLE_REASONS_LEVEL, "onPotentiallyIdle: isIdle: " + isIdle);
+
     if (isIdle) {
       LOG.log(Level.INFO, "All components indicated idle. Initiating Driver 
shutdown.");
       driverStatusManagerImpl.onComplete();

http://git-wip-us.apache.org/repos/asf/reef/blob/73c28280/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java
----------------------------------------------------------------------
diff --git 
a/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java
 
b/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java
index 4b19330..54e2450 100644
--- 
a/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java
+++ 
b/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java
@@ -71,16 +71,18 @@ public final class ResourceManagerStatus implements 
EventHandler<RuntimeStatusEv
   }
 
   @Override
-  public synchronized void onNext(final RuntimeStatusEvent runtimeStatusEvent) 
{
+  public void onNext(final RuntimeStatusEvent runtimeStatusEvent) {
 
     final State newState = runtimeStatusEvent.getState();
 
     LOG.log(Level.FINEST, "Runtime status: {0}", runtimeStatusEvent);
 
-    this.outstandingContainerRequests = 
runtimeStatusEvent.getOutstandingContainerRequests().orElse(0);
-    this.containerAllocationCount = 
runtimeStatusEvent.getContainerAllocationList().size();
+    synchronized(this) {
+      this.outstandingContainerRequests = 
runtimeStatusEvent.getOutstandingContainerRequests().orElse(0);
+      this.containerAllocationCount = 
runtimeStatusEvent.getContainerAllocationList().size();
 
-    this.setState(newState);
+      this.setState(newState);
+    }
 
     switch (newState) {
     case FAILED:
@@ -147,7 +149,7 @@ public final class ResourceManagerStatus implements 
EventHandler<RuntimeStatusEv
     this.driverStatusManager.onComplete();
   }
 
-  private synchronized void onRMRunning(final RuntimeStatusEvent 
runtimeStatusEvent) {
+  private void onRMRunning(final RuntimeStatusEvent runtimeStatusEvent) {
     assert runtimeStatusEvent.getState() == State.RUNNING;
     if (this.isIdle()) {
       this.driverIdleManager.get().onPotentiallyIdle(IDLE_MESSAGE);

Reply via email to