Repository: ambari
Updated Branches:
  refs/heads/trunk 165ec700f -> 4690fe7c6


AMBARI-20593. EU/RU Auto-Retry does not reschedule task when host is not 
heartbeating before task is scheduled and doesn't have a start time (alejandro)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/4690fe7c
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/4690fe7c
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/4690fe7c

Branch: refs/heads/trunk
Commit: 4690fe7c655ade9af91cfea7189e2efd98885149
Parents: 165ec70
Author: Alejandro Fernandez <afernan...@hortonworks.com>
Authored: Mon Mar 27 19:14:13 2017 -0700
Committer: Alejandro Fernandez <afernan...@hortonworks.com>
Committed: Tue Mar 28 10:15:44 2017 -0700

----------------------------------------------------------------------
 .../server/actionmanager/HostRoleCommand.java   |  6 +-
 .../services/RetryUpgradeActionService.java     | 60 ++++++++++++++++----
 .../services/RetryUpgradeActionServiceTest.java | 28 +++++++--
 3 files changed, 74 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/4690fe7c/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
 
b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
index 85c8e9f..651eb24 100644
--- 
a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
+++ 
b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
@@ -168,10 +168,10 @@ public class HostRoleCommand {
     errorLog = hostRoleCommandEntity.getErrorLog();
     structuredOut = hostRoleCommandEntity.getStructuredOut() != null ? new 
String(hostRoleCommandEntity.getStructuredOut()) : "";
     exitCode = hostRoleCommandEntity.getExitcode();
-    startTime = hostRoleCommandEntity.getStartTime();
-    originalStartTime = hostRoleCommandEntity.getOriginalStartTime();
+    startTime = hostRoleCommandEntity.getStartTime() != null ? 
hostRoleCommandEntity.getStartTime() : -1L;
+    originalStartTime = hostRoleCommandEntity.getOriginalStartTime() != null ? 
hostRoleCommandEntity.getOriginalStartTime() : -1L;
     endTime = hostRoleCommandEntity.getEndTime() != null ? 
hostRoleCommandEntity.getEndTime() : -1L;
-    lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime();
+    lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime() != null ? 
hostRoleCommandEntity.getLastAttemptTime() : -1L;
     attemptCount = hostRoleCommandEntity.getAttemptCount();
     retryAllowed = hostRoleCommandEntity.isRetryAllowed();
     autoSkipFailure = hostRoleCommandEntity.isFailureAutoSkipped();

http://git-wip-us.apache.org/repos/asf/ambari/blob/4690fe7c/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
 
b/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
index a92aa04..6d960c3 100644
--- 
a/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
+++ 
b/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
@@ -199,24 +199,59 @@ public class RetryUpgradeActionService extends 
AbstractScheduledService {
     List<HostRoleCommandEntity> holdingCommands = 
m_hostRoleCommandDAO.findByRequestIdAndStatuses(requestId, HOLDING_STATUSES);
     if (holdingCommands.size() > 0) {
       for (HostRoleCommandEntity hrc : holdingCommands) {
-        LOG.debug("Comparing taskId: {}, original start time: {}, now: {}",
-            hrc.getTaskId(), hrc.getOriginalStartTime(), now);
+        LOG.debug("Comparing taskId: {}, attempt count: {}, original start 
time: {}, now: {}",
+            hrc.getTaskId(), hrc.getAttemptCount(), 
hrc.getOriginalStartTime(), now);
 
         /*
+        Use-Case 1:
+        If the command has been sent to the host before because it was 
heartbeating, then it does have
+        an original start time, so we can attempt to retry on this host even 
if no longer heartbeating.
+        If the host does heartbeat again within the time interval, the command 
will actually be scheduled by the host.
+
+        Use-Case 2:
+        If the host is not heartbeating and the command is scheduled to be ran 
on it, then it means the following
+        is true,
+        - does not have original start time
+        - does not have start time
+        - attempt count is 0
+        - status will be HOLDING_TIMEDOUT
+        When the host does start heartbeating, we need to schedule this 
command by changing its state back to PENDING.
+
+        Notes:
         While testing, can update the original_start_time of records in 
host_role_command table to current epoch time.
         E.g. in postgres,
         SELECT CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000;
         UPDATE host_role_command SET attempt_count = 1, status = 
'HOLDING_FAILED', original_start_time = (CAST(EXTRACT(EPOCH FROM NOW()) AS 
BIGINT) * 1000) WHERE task_id IN (x, y, z);
-         */
-        if (canRetryCommand(hrc) && hrc.getOriginalStartTime() > 0 && 
hrc.getOriginalStartTime() < now) {
-          Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS;
-          Long deltaMS = retryTimeWindow - now;
-
-          if (deltaMS > 0) {
-            String originalStartTimeString = m_fullDateFormat.format(new 
Date(hrc.getOriginalStartTime()));
-            String deltaString = m_deltaDateFormat.format(new Date(deltaMS));
-            LOG.info("Retrying task with id: {}, attempts: {}, original start 
time: {}, time til timeout: {}",
-                hrc.getTaskId(), hrc.getAttemptCount(), 
originalStartTimeString, deltaString);
+        */
+
+        if (canRetryCommand(hrc)) {
+
+          boolean allowRetry = false;
+          // Use-Case 1
+          if (hrc.getOriginalStartTime() != null && hrc.getOriginalStartTime() 
> 0 && hrc.getOriginalStartTime() < now) {
+            Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS;
+            Long deltaMS = retryTimeWindow - now;
+
+            if (deltaMS > 0) {
+              String originalStartTimeString = m_fullDateFormat.format(new 
Date(hrc.getOriginalStartTime()));
+              String deltaString = m_deltaDateFormat.format(new Date(deltaMS));
+              LOG.info("Retrying task with id: {}, attempts: {}, original 
start time: {}, time til timeout: {}",
+                  hrc.getTaskId(), hrc.getAttemptCount(), 
originalStartTimeString, deltaString);
+              allowRetry = true;
+            }
+          }
+
+          // Use-Case 2
+          if ((hrc.getOriginalStartTime() == null || 
hrc.getOriginalStartTime() == -1L) &&
+              (hrc.getStartTime() == null || hrc.getStartTime() == -1L) &&
+              hrc.getAttemptCount() == 0){
+            LOG.info("Re-scheduling task with id: {} since it has 0 attempts, 
and null start_time and " +
+                    "original_start_time, which likely means the host was not 
heartbeating when the command was supposed to be scheduled.",
+                hrc.getTaskId());
+            allowRetry = true;
+          }
+
+          if (allowRetry) {
             retryHostRoleCommand(hrc);
           }
         }
@@ -262,6 +297,7 @@ public class RetryUpgradeActionService extends 
AbstractScheduledService {
     hrc.setStatus(HostRoleStatus.PENDING);
     hrc.setStartTime(-1L);
     // Don't change the original start time.
+    hrc.setEndTime(-1L);
     hrc.setLastAttemptTime(-1L);
 
     // This will invalidate the cache, as expected.

http://git-wip-us.apache.org/repos/asf/ambari/blob/4690fe7c/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
 
b/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
index e699e49..e2ce6e7 100644
--- 
a/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
+++ 
b/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
@@ -109,10 +109,12 @@ public class RetryUpgradeActionServiceTest {
    * Case 4: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED that
    * does NOT meet conditions to be retried => no-op
    * Case 5: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED that
-   * DOES meet conditions to be retried => retries the task
-   * Case 6: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED that
+   * DOES meet conditions to be retried and has values for start time and 
original start time => retries the task
+   * * Case 6: Cluster with an active upgrade that contains a failed task in 
HOLDING_TIMEDOUT that
+   * DOES meet conditions to be retriedand does not have values for start time 
or original start time => retries the task
+   * Case 7: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED that
    * was already retried and has now expired => no-op
-   * Case 7: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED, but it is a critical task
+   * Case 8: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED, but it is a critical task
    * during Finalize Cluster, which should not be retried => no-op
    * @throws Exception
    */
@@ -185,7 +187,23 @@ public class RetryUpgradeActionServiceTest {
     // Ensure that task 2 transitioned from HOLDING_FAILED to PENDING
     Assert.assertEquals(HostRoleStatus.PENDING, 
hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
 
-    // Case 6: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED that was already retried and has now expired.
+    // Case 6: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED that DOES meet conditions to be retried.
+    hrc2.setStatus(HostRoleStatus.HOLDING_TIMEDOUT);
+    hrc2.setRetryAllowed(true);
+    hrc2.setOriginalStartTime(-1L);
+    hrc2.setStartTime(-1L);
+    hrc2.setLastAttemptTime(-1L);
+    hrc2.setEndTime(-1L);
+    hrc2.setAttemptCount((short) 0);
+    hostRoleCommandDAO.merge(hrc2);
+
+    // Run the service
+    service.runOneIteration();
+
+    // Ensure that task 2 transitioned from HOLDING_TIMEDOUT to PENDING
+    Assert.assertEquals(HostRoleStatus.PENDING, 
hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
+    
+    // Case 7: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED that was already retried and has now expired.
     now = System.currentTimeMillis();
     hrc2.setOriginalStartTime(now - (timeoutMins * 60000) - 1);
     hrc2.setStatus(HostRoleStatus.HOLDING_FAILED);
@@ -196,7 +214,7 @@ public class RetryUpgradeActionServiceTest {
 
     Assert.assertEquals(HostRoleStatus.HOLDING_FAILED, 
hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
 
-    // Case 7: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED, but it is a critical task
+    // Case 8: Cluster with an active upgrade that contains a failed task in 
HOLDING_FAILED, but it is a critical task
     // during Finalize Cluster, which should not be retried.
     now = System.currentTimeMillis();
     hrc2.setOriginalStartTime(now);

Reply via email to