Repository: aurora Updated Branches: refs/heads/master 9c9b592a5 -> 31404487d
Export count-down to forceful Maintenance as a metric. Since the scheduler enforces a maximum timeout on each maintenance request and we now allow CoordinatorSlaPolicy to block maintenance, we need to know which tasks are running into the force maintenance timeout. Export maintenance count down time as a metric broken down by task keys. Testing Done: ./gradlew test **Tested in Vagrant** sshanmugham::tw-mbp-sshanmugham {~}$ curl http://192.168.33.7:8081/vars | grep maintenance_countdown ######################################################################## 100.0% maintenance_countdown_ms_vagrant/test/coordinator/0 264523 maintenance_countdown_ms_vagrant/test/coordinator/1 24476 sshanmugham::tw-mbp-sshanmugham {~}$ curl http://192.168.33.7:8081/vars | grep maintenance_countdown ######################################################################## 100.0% maintenance_countdown_ms_vagrant/test/coordinator/0 264523 maintenance_countdown_ms_vagrant/test/coordinator/1 24476 sshanmugham::tw-mbp-sshanmugham {~}$ curl http://192.168.33.7:8081/vars | grep maintenance_countdown ######################################################################## 100.0% maintenance_countdown_ms_vagrant/test/coordinator/0 264523 maintenance_countdown_ms_vagrant/test/coordinator/1 0 Reviewed at https://reviews.apache.org/r/67639/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/31404487 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/31404487 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/31404487 Branch: refs/heads/master Commit: 31404487de5518903178061e76723dd10bf43826 Parents: 9c9b592 Author: Santhosh Kumar Shanmugham <santhoshkuma...@gmail.com> Authored: Tue Jun 19 10:31:50 2018 -0700 Committer: Santhosh Kumar <sshanmug...@twitter.com> Committed: Tue Jun 19 10:31:50 2018 -0700 ---------------------------------------------------------------------- .../aurora/scheduler/base/InstanceKeys.java | 11 
++++++ .../maintenance/MaintenanceController.java | 35 ++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/31404487/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java b/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java index b12ac83..c2ef2f6 100644 --- a/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java +++ b/src/main/java/org/apache/aurora/scheduler/base/InstanceKeys.java @@ -51,4 +51,15 @@ public final class InstanceKeys { public static String toString(IInstanceKey instance) { return JobKeys.canonicalString(instance.getJobKey()) + "/" + instance.getInstanceId(); } + + /** + * Creates a human-friendly string for an instance key. + * + * @param job Job key. + * @param instanceId Instance id. + * @return String representation of the instance key. 
+ */ + public static String toString(IJobKey job, int instanceId) { + return toString(from(job, instanceId)); + } } http://git-wip-us.apache.org/repos/asf/aurora/blob/31404487/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java b/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java index 7fc5990..626a682 100644 --- a/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java +++ b/src/main/java/org/apache/aurora/scheduler/maintenance/MaintenanceController.java @@ -20,6 +20,7 @@ import java.util.Collections; import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.concurrent.ExecutionException; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; import javax.inject.Inject; @@ -27,7 +28,11 @@ import javax.inject.Qualifier; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; +import com.google.common.base.Joiner; import com.google.common.base.Predicates; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; @@ -46,6 +51,7 @@ import org.apache.aurora.gen.ScheduleStatus; import org.apache.aurora.gen.SlaPolicy; import org.apache.aurora.scheduler.BatchWorker; import org.apache.aurora.scheduler.SchedulerModule.TaskEventBatchWorker; +import org.apache.aurora.scheduler.base.InstanceKeys; import org.apache.aurora.scheduler.base.Query; import org.apache.aurora.scheduler.base.Tasks; import org.apache.aurora.scheduler.config.types.TimeAmount; @@ -171,6 +177,7 @@ public interface MaintenanceController { @VisibleForTesting 
static final String DRAINING_MESSAGE = "Draining machine for maintenance."; + private static final String MAINTENANCE_COUNTDOWN_STAT_NAME = "maintenance_countdown_ms"; private static final String MISSING_MAINTENANCE_REQUEST = "missing_maintenance_request"; private static final SlaPolicy ZERO_PERCENT_SLA = SlaPolicy.percentageSlaPolicy( new PercentageSlaPolicy() @@ -184,6 +191,7 @@ public interface MaintenanceController { private final StateManager stateManager; private final AtomicLong missingMaintenanceCounter; + private final LoadingCache<String, AtomicLong> maintenanceCountDownByTask; @Inject public MaintenanceControllerImpl( @@ -200,6 +208,14 @@ public interface MaintenanceController { this.slaManager = requireNonNull(slaManager); this.stateManager = requireNonNull(stateManager); this.missingMaintenanceCounter = statsProvider.makeCounter(MISSING_MAINTENANCE_REQUEST); + this.maintenanceCountDownByTask = CacheBuilder.newBuilder().build( + new CacheLoader<String, AtomicLong>() { + @Override + public AtomicLong load(String key) { + return statsProvider.makeCounter(key); + } + } + ); } private Set<String> drainTasksOnHost(String host, StoreProvider store) { @@ -214,7 +230,13 @@ public interface MaintenanceController { // shuffle the candidates to avoid head-of-line blocking Collections.shuffle(candidates); - candidates.forEach(task -> drainTask(task, store)); + candidates.forEach(task -> { + try { + drainTask(task, store); + } catch (ExecutionException e) { + LOG.error("Exception when trying to drain task: {}", Tasks.id(task), e); + } + }); return candidates.stream().map(Tasks::id).collect(Collectors.toSet()); } @@ -432,7 +454,7 @@ public interface MaintenanceController { pollingInterval.getUnit().getTimeUnit()); } - private void drainTask(IScheduledTask task, StoreProvider store) { + private void drainTask(IScheduledTask task, StoreProvider store) throws ExecutionException { String host = task.getAssignedTask().getSlaveHost(); Optional<IHostMaintenanceRequest> 
hostMaintenanceRequest = store.getHostMaintenanceStore().getHostMaintenanceRequest(host); @@ -445,6 +467,15 @@ public interface MaintenanceController { boolean force = false; long expireMs = System.currentTimeMillis() - hostMaintenanceRequest.get().getCreatedTimestampMs(); + long maintenanceCountDownMs = + TimeAmount.of(hostMaintenanceRequest.get().getTimeoutSecs(), Time.SECONDS) + .as(Time.MILLISECONDS) - expireMs; + maintenanceCountDownByTask.get( + Joiner.on("_") + .join(MAINTENANCE_COUNTDOWN_STAT_NAME, + InstanceKeys.toString(Tasks.getJob(task), Tasks.getInstanceId(task)))) + .getAndSet(maintenanceCountDownMs); + if (hostMaintenanceRequest.get().getTimeoutSecs() < TimeAmount.of(expireMs, Time.MILLISECONDS).as(Time.SECONDS)) { LOG.warn("Maintenance request timed out for host: {} after {} secs. Forcing drain of {}.",