Repository: reef
Updated Branches:
  refs/heads/master 9f98435e0 -> 1e1fff6f2
[REEF-1677] Count evaluators failed during WaitingForEvaluator phase towards MaximumNumberOfEvaluatorFailures limit Previously evaluators which failed during WaitingForEvaluator phase were not counted. This caused long wait times for IMRU jobs which had a lot of failures and should have failed sooner. JIRA: [REEF-1677](https://issues.apache.org/jira/browse/REEF-1677) Pull request: This closes #1192 Project: http://git-wip-us.apache.org/repos/asf/reef/repo Commit: http://git-wip-us.apache.org/repos/asf/reef/commit/1e1fff6f Tree: http://git-wip-us.apache.org/repos/asf/reef/tree/1e1fff6f Diff: http://git-wip-us.apache.org/repos/asf/reef/diff/1e1fff6f Branch: refs/heads/master Commit: 1e1fff6f2e2873e39a7338bb1370148ed17a2401 Parents: 9f98435 Author: Mariia Mykhailova <[email protected]> Authored: Mon Nov 28 16:44:22 2016 -0800 Committer: Julia Wang <[email protected]> Committed: Wed Nov 30 17:23:22 2016 -0800 ---------------------------------------------------------------------- .../OnREEF/Driver/EvaluatorManager.cs | 37 +++++++------------- .../OnREEF/Driver/IMRUDriver.cs | 1 - 2 files changed, 12 insertions(+), 26 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/reef/blob/1e1fff6f/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs ---------------------------------------------------------------------- diff --git a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs index fdbb463..27a0f3e 100644 --- a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs +++ b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/EvaluatorManager.cs @@ -36,13 +36,15 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver private static readonly Logger Logger = Logger.GetLogger(typeof(EvaluatorManager)); private readonly ISet<string> _allocatedEvaluatorIds = new HashSet<string>(); - private readonly ISet<string> _failedEvaluatorIds = new 
HashSet<string>(); private readonly int _totalExpectedEvaluators; private readonly int _allowedNumberOfEvaluatorFailures; private readonly IEvaluatorRequestor _evaluatorRequestor; private string _masterEvaluatorId; + private int _failedEvaluatorsCount; + private bool _masterEvaluatorFailed; + private readonly EvaluatorSpecification _updateEvaluatorSpecification; private readonly EvaluatorSpecification _mapperEvaluatorSpecification; @@ -199,27 +201,11 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver internal void RecordFailedEvaluator(string evaluatorId) { RemoveAllocatedEvaluator(evaluatorId); - - if (_failedEvaluatorIds.Contains(evaluatorId)) - { - string msg = string.Format("The failed evaluator {0} has been recorded.", evaluatorId); - Exceptions.Throw(new IMRUSystemException(msg), Logger); - } - _failedEvaluatorIds.Add(evaluatorId); - } - - /// <summary> - /// Remove failed evaluator from the collection - /// </summary> - /// <param name="evaluatorId"></param> - internal void RemoveFailedEvaluator(string evaluatorId) - { - if (!_failedEvaluatorIds.Contains(evaluatorId)) + if (_masterEvaluatorId != null && _masterEvaluatorId.Equals(evaluatorId)) { - string msg = string.Format("The failed evaluator {0} is not recorded in list of failed evaluators.", evaluatorId); - Exceptions.Throw(new IMRUSystemException(msg), Logger); + _masterEvaluatorFailed = true; } - _failedEvaluatorIds.Remove(evaluatorId); + _failedEvaluatorsCount++; } /// <summary> @@ -227,7 +213,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver /// </summary> internal bool ExceededMaximumNumberOfEvaluatorFailures() { - return _failedEvaluatorIds.Count > AllowedNumberOfEvaluatorFailures; + return _failedEvaluatorsCount > AllowedNumberOfEvaluatorFailures; } /// <summary> @@ -247,7 +233,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver { ResetMasterEvaluatorId(); } - _failedEvaluatorIds.Clear(); + _failedEvaluatorsCount = 0; } /// <summary> @@ -281,6 +267,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver 
Exceptions.Throw(new IMRUSystemException("Master evaluator is already null"), Logger); } _masterEvaluatorId = null; + _masterEvaluatorFailed = false; } /// <summary> @@ -312,7 +299,7 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver /// <returns></returns> internal bool IsMasterEvaluatorFailed() { - return _masterEvaluatorId != null && _failedEvaluatorIds.Contains(_masterEvaluatorId); + return _masterEvaluatorFailed; } /// <summary> @@ -323,9 +310,9 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver { if (IsMasterEvaluatorFailed()) { - return _failedEvaluatorIds.Count - 1; + return _failedEvaluatorsCount - 1; } - return _failedEvaluatorIds.Count; + return _failedEvaluatorsCount; } /// <summary> http://git-wip-us.apache.org/repos/asf/reef/blob/1e1fff6f/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs ---------------------------------------------------------------------- diff --git a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs index c82dd2d..52e7c6a 100644 --- a/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs +++ b/lang/cs/Org.Apache.REEF.IMRU/OnREEF/Driver/IMRUDriver.cs @@ -528,7 +528,6 @@ namespace Org.Apache.REEF.IMRU.OnREEF.Driver _serviceAndContextConfigurationProvider.RemoveEvaluatorIdFromPartitionIdProvider( failedEvaluator.Id); Logger.Log(Level.Info, "Requesting mapper Evaluators."); - _evaluatorManager.RemoveFailedEvaluator(failedEvaluator.Id); _evaluatorManager.RequestMapEvaluators(1); } else
