Repository: aurora
Updated Branches:
  refs/heads/master 9c9b592a5 -> 31404487d


Export count-down to forceful maintenance as a metric.

Since the scheduler enforces a maximum timeout on each
maintenance request and we now allow CoordinatorSlaPolicy
to block maintenance, we need to know which tasks are
running into the force maintenance timeout. Export maintenance
count-down time as a metric broken down by task keys.

Testing Done:
./gradlew test

**Tested in Vagrant**
sshanmugham::tw-mbp-sshanmugham {~}$ curl http://192.168.33.7:8081/vars | grep 
maintenance_countdown
######################################################################## 100.0%
maintenance_countdown_ms_vagrant/test/coordinator/0 264523
maintenance_countdown_ms_vagrant/test/coordinator/1 24476
sshanmugham::tw-mbp-sshanmugham {~}$ curl http://192.168.33.7:8081/vars | grep 
maintenance_countdown
######################################################################## 100.0%
maintenance_countdown_ms_vagrant/test/coordinator/0 264523
maintenance_countdown_ms_vagrant/test/coordinator/1 24476
sshanmugham::tw-mbp-sshanmugham {~}$ curl http://192.168.33.7:8081/vars | grep 
maintenance_countdown
######################################################################## 100.0%
maintenance_countdown_ms_vagrant/test/coordinator/0 264523
maintenance_countdown_ms_vagrant/test/coordinator/1 0

Reviewed at https://reviews.apache.org/r/67639/


Project: http://git-wip-us.apache.org/repos/asf/aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/31404487
Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/31404487
Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/31404487

Branch: refs/heads/master
Commit: 31404487de5518903178061e76723dd10bf43826
Parents: 9c9b592
Author: Santhosh Kumar Shanmugham <santhoshkuma...@gmail.com>
Authored: Tue Jun 19 10:31:50 2018 -0700
Committer: Santhosh Kumar <sshanmug...@twitter.com>
Committed: Tue Jun 19 10:31:50 2018 -0700

----------------------------------------------------------------------
 .../aurora/scheduler/base/InstanceKeys.java     | 11 ++++++
 .../maintenance/MaintenanceController.java      | 35 ++++++++++++++++++--
 2 files changed, 44 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/aurora/blob/31404487/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java 
b/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java
index b12ac83..c2ef2f6 100644
--- a/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java
+++ b/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java
@@ -51,4 +51,15 @@ public final class InstanceKeys {
   public static String toString(IInstanceKey instance) {
     return JobKeys.canonicalString(instance.getJobKey()) + "/" + 
instance.getInstanceId();
   }
+
+  /**
+   * Creates a human-friendly string for an instance key.
+   *
+   * @param job Job key.
+   * @param instanceId Instance id.
+   * @return String representation of the instance key.
+   */
+  public static String toString(IJobKey job, int instanceId) {
+    return toString(from(job, instanceId));
+  }
 }

http://git-wip-us.apache.org/repos/asf/aurora/blob/31404487/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java
 
b/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java
index 7fc5990..626a682 100644
--- 
a/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java
+++ 
b/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java
@@ -20,6 +20,7 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Optional;
 import java.util.Set;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Collectors;
 import javax.inject.Inject;
@@ -27,7 +28,11 @@ import javax.inject.Qualifier;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Function;
+import com.google.common.base.Joiner;
 import com.google.common.base.Predicates;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheLoader;
+import com.google.common.cache.LoadingCache;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
@@ -46,6 +51,7 @@ import org.apache.aurora.gen.ScheduleStatus;
 import org.apache.aurora.gen.SlaPolicy;
 import org.apache.aurora.scheduler.BatchWorker;
 import org.apache.aurora.scheduler.SchedulerModule.TaskEventBatchWorker;
+import org.apache.aurora.scheduler.base.InstanceKeys;
 import org.apache.aurora.scheduler.base.Query;
 import org.apache.aurora.scheduler.base.Tasks;
 import org.apache.aurora.scheduler.config.types.TimeAmount;
@@ -171,6 +177,7 @@ public interface MaintenanceController {
     @VisibleForTesting
     static final String DRAINING_MESSAGE = "Draining machine for maintenance.";
 
+    private static final String MAINTENANCE_COUNTDOWN_STAT_NAME = 
"maintenance_countdown_ms";
     private static final String MISSING_MAINTENANCE_REQUEST = 
"missing_maintenance_request";
     private static final SlaPolicy ZERO_PERCENT_SLA = 
SlaPolicy.percentageSlaPolicy(
         new PercentageSlaPolicy()
@@ -184,6 +191,7 @@ public interface MaintenanceController {
     private final StateManager stateManager;
 
     private final AtomicLong missingMaintenanceCounter;
+    private final LoadingCache<String, AtomicLong> maintenanceCountDownByTask;
 
     @Inject
     public MaintenanceControllerImpl(
@@ -200,6 +208,14 @@ public interface MaintenanceController {
       this.slaManager = requireNonNull(slaManager);
       this.stateManager = requireNonNull(stateManager);
       this.missingMaintenanceCounter = 
statsProvider.makeCounter(MISSING_MAINTENANCE_REQUEST);
+      this.maintenanceCountDownByTask = CacheBuilder.newBuilder().build(
+          new CacheLoader<String, AtomicLong>() {
+            @Override
+            public AtomicLong load(String key) {
+              return statsProvider.makeCounter(key);
+            }
+          }
+      );
     }
 
     private Set<String> drainTasksOnHost(String host, StoreProvider store) {
@@ -214,7 +230,13 @@ public interface MaintenanceController {
 
       // shuffle the candidates to avoid head-of-line blocking
       Collections.shuffle(candidates);
-      candidates.forEach(task -> drainTask(task, store));
+      candidates.forEach(task -> {
+        try {
+          drainTask(task, store);
+        } catch (ExecutionException e) {
+          LOG.error("Exception when trying to drain task: {}", Tasks.id(task), 
e);
+        }
+      });
 
       return candidates.stream().map(Tasks::id).collect(Collectors.toSet());
     }
@@ -432,7 +454,7 @@ public interface MaintenanceController {
           pollingInterval.getUnit().getTimeUnit());
     }
 
-    private void drainTask(IScheduledTask task, StoreProvider store) {
+    private void drainTask(IScheduledTask task, StoreProvider store) throws 
ExecutionException {
       String host = task.getAssignedTask().getSlaveHost();
       Optional<IHostMaintenanceRequest> hostMaintenanceRequest =
           store.getHostMaintenanceStore().getHostMaintenanceRequest(host);
@@ -445,6 +467,15 @@ public interface MaintenanceController {
       boolean force = false;
       long expireMs =
           System.currentTimeMillis() - 
hostMaintenanceRequest.get().getCreatedTimestampMs();
+      long maintenanceCountDownMs =
+          TimeAmount.of(hostMaintenanceRequest.get().getTimeoutSecs(), 
Time.SECONDS)
+              .as(Time.MILLISECONDS) - expireMs;
+      maintenanceCountDownByTask.get(
+          Joiner.on("_")
+              .join(MAINTENANCE_COUNTDOWN_STAT_NAME,
+                  InstanceKeys.toString(Tasks.getJob(task), 
Tasks.getInstanceId(task))))
+          .getAndSet(maintenanceCountDownMs);
+
       if (hostMaintenanceRequest.get().getTimeoutSecs()
             < TimeAmount.of(expireMs, Time.MILLISECONDS).as(Time.SECONDS)) {
         LOG.warn("Maintenance request timed out for host: {} after {} secs. 
Forcing drain of {}.",

Reply via email to