Repository: reef Updated Branches: refs/heads/master 1a7c6d1d3 -> 73c28280c
[REEF-1482] Driver does not exit even if all the task exit normally Currently, when all the tasks and all the evaluators are completed, sometimes driver still doesn't shut down and hangs there forever. This happens intermittently. When there are many nodes like 500 nodes in IMRU runs in Yarn cluster, the issue can happen in every 2 or 3 runs. The investigation shows there is a potential deadlock in ResourceManagerStatus. This PR is to resolve this issue by reducing the scope of code under locks. With the fixes, I have tested 10 times with 500 nodes in cluster, there is no repro any more. JIRA: [REEF-1482](https://issues.apache.org/jira/browse/REEF-1482) Pull request: This closes #1162 Project: http://git-wip-us.apache.org/repos/asf/reef/repo Commit: http://git-wip-us.apache.org/repos/asf/reef/commit/73c28280 Tree: http://git-wip-us.apache.org/repos/asf/reef/tree/73c28280 Diff: http://git-wip-us.apache.org/repos/asf/reef/diff/73c28280 Branch: refs/heads/master Commit: 73c28280c518996714a7bcacdd9e687baafaf35d Parents: 1a7c6d1 Author: Julia Wang <[email protected]> Authored: Thu Oct 20 19:22:18 2016 -0700 Committer: Mariia Mykhailova <[email protected]> Committed: Mon Oct 31 11:01:27 2016 -0700 ---------------------------------------------------------------------- .../runtime/common/driver/idle/DriverIdleManager.java | 2 ++ .../driver/resourcemanager/ResourceManagerStatus.java | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/reef/blob/73c28280/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java ---------------------------------------------------------------------- diff --git a/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java b/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java index 
2792790..13019fd 100644 --- a/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java +++ b/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/idle/DriverIdleManager.java @@ -77,6 +77,8 @@ public final class DriverIdleManager { isIdle &= idleMessage.isIdle(); } + LOG.log(IDLE_REASONS_LEVEL, "onPotentiallyIdle: isIdle: " + isIdle); + if (isIdle) { LOG.log(Level.INFO, "All components indicated idle. Initiating Driver shutdown."); driverStatusManagerImpl.onComplete(); http://git-wip-us.apache.org/repos/asf/reef/blob/73c28280/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java ---------------------------------------------------------------------- diff --git a/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java b/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java index 4b19330..54e2450 100644 --- a/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java +++ b/lang/java/reef-common/src/main/java/org/apache/reef/runtime/common/driver/resourcemanager/ResourceManagerStatus.java @@ -71,16 +71,18 @@ public final class ResourceManagerStatus implements EventHandler<RuntimeStatusEv } @Override - public synchronized void onNext(final RuntimeStatusEvent runtimeStatusEvent) { + public void onNext(final RuntimeStatusEvent runtimeStatusEvent) { final State newState = runtimeStatusEvent.getState(); LOG.log(Level.FINEST, "Runtime status: {0}", runtimeStatusEvent); - this.outstandingContainerRequests = runtimeStatusEvent.getOutstandingContainerRequests().orElse(0); - this.containerAllocationCount = runtimeStatusEvent.getContainerAllocationList().size(); + synchronized(this) { + this.outstandingContainerRequests = 
runtimeStatusEvent.getOutstandingContainerRequests().orElse(0); + this.containerAllocationCount = runtimeStatusEvent.getContainerAllocationList().size(); - this.setState(newState); + this.setState(newState); + } switch (newState) { case FAILED: @@ -147,7 +149,7 @@ public final class ResourceManagerStatus implements EventHandler<RuntimeStatusEv this.driverStatusManager.onComplete(); } - private synchronized void onRMRunning(final RuntimeStatusEvent runtimeStatusEvent) { + private void onRMRunning(final RuntimeStatusEvent runtimeStatusEvent) { assert runtimeStatusEvent.getState() == State.RUNNING; if (this.isIdle()) { this.driverIdleManager.get().onPotentiallyIdle(IDLE_MESSAGE);
