YARN-8362. Bugfix logic in container retries in node manager. Contributed by Chandni Singh
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/135941e0 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/135941e0 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/135941e0 Branch: refs/heads/HDDS-48 Commit: 135941e00d762a417c3b4cc524cdc59b0d1810b1 Parents: 2416906 Author: Eric Yang <ey...@apache.org> Authored: Tue May 29 16:56:58 2018 -0400 Committer: Eric Yang <ey...@apache.org> Committed: Tue May 29 16:56:58 2018 -0400 ---------------------------------------------------------------------- .../container/ContainerImpl.java | 4 +- .../container/SlidingWindowRetryPolicy.java | 62 +++++++++++--------- .../container/TestSlidingWindowRetryPolicy.java | 6 ++ 3 files changed, 44 insertions(+), 28 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/135941e0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index c09c7f1..5527ac4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -1602,8 +1602,10 @@ public class ContainerImpl implements Container { } container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n"); } - if (container.shouldRetry(container.exitCode)) { + // Updates to the retry context should be protected from concurrent + // writes. It should only be called from this transition. + container.retryPolicy.updateRetryContext(container.windowRetryContext); container.storeRetryContext(); doRelaunch(container, container.windowRetryContext.getRemainingRetries(), http://git-wip-us.apache.org/repos/asf/hadoop/blob/135941e0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java index 0208879..36a8b91 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java @@ -42,49 +42,40 @@ public class SlidingWindowRetryPolicy { public boolean shouldRetry(RetryContext retryContext, int errorCode) { - ContainerRetryContext containerRC = retryContext - .containerRetryContext; + ContainerRetryContext containerRC = retryContext.containerRetryContext; Preconditions.checkNotNull(containerRC, "container retry context null"); ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy(); if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS || (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES && containerRC.getErrorCodes() != null && containerRC.getErrorCodes().contains(errorCode))) { - if (containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER) { - return true; - } - int pendingRetries = calculatePendingRetries(retryContext); - updateRetryContext(retryContext, pendingRetries); - return pendingRetries > 0; + return containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER + || calculateRemainingRetries(retryContext) > 0; } return false; } /** - * Calculates the pending number of retries. - * <p> - * When failuresValidityInterval is > 0, it also removes time entries from - * <code>restartTimes</code> which are outside the validity interval. + * Calculates the remaining number of retries. * - * @return the pending retries. + * @return the remaining retries. */ - private int calculatePendingRetries(RetryContext retryContext) { + private int calculateRemainingRetries(RetryContext retryContext) { ContainerRetryContext containerRC = retryContext.containerRetryContext; if (containerRC.getFailuresValidityInterval() > 0) { - Iterator<Long> iterator = retryContext.getRestartTimes().iterator(); + int validFailuresCount = 0; long currentTime = clock.getTime(); - while (iterator.hasNext()) { - long restartTime = iterator.next(); + for (int i = retryContext.restartTimes.size() - 1; i >= 0; i--) { + long restartTime = retryContext.restartTimes.get(i); if (currentTime - restartTime - > containerRC.getFailuresValidityInterval()) { - iterator.remove(); + <= containerRC.getFailuresValidityInterval()) { + validFailuresCount++; } else { break; } } - return containerRC.getMaxRetries() - - retryContext.getRestartTimes().size(); + return containerRC.getMaxRetries() - validFailuresCount; } else { return retryContext.getRemainingRetries(); } @@ -93,13 +84,30 @@ public class SlidingWindowRetryPolicy { /** * Updates remaining retries and the restart time when * required in the retryContext. + * <p> + * When failuresValidityInterval is > 0, it also removes time entries from + * <code>restartTimes</code> which are outside the validity interval. */ - private void updateRetryContext(RetryContext retryContext, - int pendingRetries) { - retryContext.setRemainingRetries(pendingRetries - 1); - if (retryContext.containerRetryContext.getFailuresValidityInterval() - > 0) { - retryContext.getRestartTimes().add(clock.getTime()); + protected void updateRetryContext(RetryContext retryContext) { + if (retryContext.containerRetryContext.getFailuresValidityInterval() > 0) { + ContainerRetryContext containerRC = retryContext.containerRetryContext; + Iterator<Long> iterator = retryContext.getRestartTimes().iterator(); + long currentTime = clock.getTime(); + + while (iterator.hasNext()) { + long restartTime = iterator.next(); + if (currentTime - restartTime + > containerRC.getFailuresValidityInterval()) { + iterator.remove(); + } else { + break; + } + } + retryContext.setRemainingRetries(containerRC.getMaxRetries() - + retryContext.restartTimes.size()); + retryContext.getRestartTimes().add(currentTime); + } else { + retryContext.remainingRetries--; } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/135941e0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java index 04889a9..bacf3bb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java @@ -64,12 +64,18 @@ public class TestSlidingWindowRetryPolicy { new SlidingWindowRetryPolicy.RetryContext(retryContext); Assert.assertTrue("retry 1", retryPolicy.shouldRetry(windowRetryContext, 12)); + retryPolicy.updateRetryContext(windowRetryContext); + clock.setTime(20); Assert.assertTrue("retry 2", retryPolicy.shouldRetry(windowRetryContext, 12)); + retryPolicy.updateRetryContext(windowRetryContext); + clock.setTime(40); Assert.assertTrue("retry 3", retryPolicy.shouldRetry(windowRetryContext, 12)); + retryPolicy.updateRetryContext(windowRetryContext); + clock.setTime(45); Assert.assertFalse("retry failed", retryPolicy.shouldRetry(windowRetryContext, 12)); --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org