Repository: aurora Updated Branches: refs/heads/master 4719fa730 -> 9c9b592a5
Export the number of tasks lost per dedicated role. When there are hundreds of dedicated roles in a cluster, the tasks_LOST_<job> metric is not enough. Introduce a per-dedicated-role metric for easier diagnosis. Testing Done: ./gradlew test **Tested on Vagrant** tasks_lost_dedicated____web.multi 0 tasks_lost_dedicated_vagrant 2 Reviewed at https://reviews.apache.org/r/67638/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/9c9b592a Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/9c9b592a Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/9c9b592a Branch: refs/heads/master Commit: 9c9b592a55f86cee7a3ed946710e92037ea9b0d9 Parents: 4719fa7 Author: Santhosh Kumar Shanmugham <[email protected]> Authored: Mon Jun 18 16:39:34 2018 -0700 Committer: Santhosh Kumar <[email protected]> Committed: Mon Jun 18 16:39:47 2018 -0700 ---------------------------------------------------------------------- .../org/apache/aurora/scheduler/TaskVars.java | 53 +++++++++++++++----- .../apache/aurora/scheduler/TaskVarsTest.java | 44 +++++++++++++--- 2 files changed, 77 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/9c9b592a/src/main/java/org/apache/aurora/scheduler/TaskVars.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/TaskVars.java b/src/main/java/org/apache/aurora/scheduler/TaskVars.java index ee20ed3..8bb5f3e 100644 --- a/src/main/java/org/apache/aurora/scheduler/TaskVars.java +++ b/src/main/java/org/apache/aurora/scheduler/TaskVars.java @@ -17,7 +17,7 @@ import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; - +import java.util.stream.StreamSupport; import javax.inject.Inject; import com.google.common.annotations.VisibleForTesting; @@ -28,7 +28,6 @@ import 
com.google.common.base.Supplier; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; -import com.google.common.collect.FluentIterable; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; @@ -114,6 +113,11 @@ public class TaskVars extends AbstractIdleService implements EventSubscriber { } @VisibleForTesting + static String dedicatedRoleStatName(String role) { + return "tasks_lost_dedicated_" + role.replace("*", "_"); + } + + @VisibleForTesting static String jobStatName(IScheduledTask task, ScheduleStatus status) { return String.format( "tasks_%s_%s", @@ -144,20 +148,18 @@ public class TaskVars extends AbstractIdleService implements EventSubscriber { if (Strings.isNullOrEmpty(task.getAssignedTask().getSlaveHost())) { rack = Optional.empty(); } else { - rack = storage.read(storeProvider -> { - Optional<IAttribute> rack1 = FluentIterable - .from(AttributeStore.Util.attributesOrNone(storeProvider, host)) - .firstMatch(IS_RACK) - .toJavaUtil(); - return rack1.map(ATTR_VALUE); - }); + rack = storage.read(storeProvider -> + StreamSupport.stream( + AttributeStore.Util.attributesOrNone(storeProvider, host).spliterator(), + false) + .filter(IS_RACK) + .findFirst() + .map(ATTR_VALUE)); } // Always dummy-read the lost-tasks-per-rack stat. This ensures that there is at least a zero // exported for all racks. 
- if (rack.isPresent()) { - counters.getUnchecked(rackStatName(rack.get())); - } + rack.ifPresent(s -> counters.getUnchecked(rackStatName(s))); if (newState == ScheduleStatus.LOST) { if (rack.isPresent()) { @@ -168,6 +170,32 @@ public class TaskVars extends AbstractIdleService implements EventSubscriber { } } + private void updateDedicatedCounters(IScheduledTask task, ScheduleStatus newState) { + final String host = task.getAssignedTask().getSlaveHost(); + ImmutableSet<String> dedicatedRoles; + if (Strings.isNullOrEmpty(host)) { + dedicatedRoles = ImmutableSet.of(); + } else { + dedicatedRoles = storage.read(store -> + StreamSupport.stream( + AttributeStore.Util.attributesOrNone(store, host).spliterator(), + false) + .filter(attr -> "dedicated".equals(attr.getName())) + .findFirst() + .map(IAttribute::getValues) + .orElse(ImmutableSet.of()) + ); + } + + // Always dummy-read the lost-tasks-per-role stat. This ensures that there is at least a zero + // exported for all roles. + dedicatedRoles.forEach(s -> counters.getUnchecked(dedicatedRoleStatName(s))); + + if (newState == ScheduleStatus.LOST) { + dedicatedRoles.forEach(s -> counters.getUnchecked(dedicatedRoleStatName(s)).increment()); + } + } + private void updateJobCounters(IScheduledTask task, ScheduleStatus newState) { if (TRACKED_JOB_STATES.contains(newState)) { untrackedCounters.getUnchecked(jobStatName(task, newState)).increment(); @@ -186,6 +214,7 @@ public class TaskVars extends AbstractIdleService implements EventSubscriber { updateRackCounters(task, task.getStatus()); updateJobCounters(task, task.getStatus()); + updateDedicatedCounters(task, task.getStatus()); } @Override http://git-wip-us.apache.org/repos/asf/aurora/blob/9c9b592a/src/test/java/org/apache/aurora/scheduler/TaskVarsTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/aurora/scheduler/TaskVarsTest.java b/src/test/java/org/apache/aurora/scheduler/TaskVarsTest.java index 
6321ec0..2bae69d 100644 --- a/src/test/java/org/apache/aurora/scheduler/TaskVarsTest.java +++ b/src/test/java/org/apache/aurora/scheduler/TaskVarsTest.java @@ -53,6 +53,7 @@ import static org.apache.aurora.gen.ScheduleStatus.PENDING; import static org.apache.aurora.gen.ScheduleStatus.RUNNING; import static org.apache.aurora.scheduler.TaskVars.VETO_GROUPS_TO_COUNTERS; import static org.apache.aurora.scheduler.TaskVars.VETO_TYPE_TO_COUNTERS; +import static org.apache.aurora.scheduler.TaskVars.dedicatedRoleStatName; import static org.apache.aurora.scheduler.TaskVars.jobStatName; import static org.apache.aurora.scheduler.TaskVars.rackStatName; import static org.easymock.EasyMock.expect; @@ -185,8 +186,11 @@ public class TaskVarsTest extends EasyMockTest { expectStatusCountersInitialized(); IScheduledTask taskA = makeTask(JOB_A, INIT); - expectGetHostRack("hostA", "rackA").atLeastOnce(); + expectGetHostAttributes("hostA", "rackA", ImmutableSet.of("role-1/job-1", "role.2/job.2")) + .atLeastOnce(); expectStatExport(rackStatName("rackA")); + expectStatExport(dedicatedRoleStatName("role-1/job-1")); + expectStatExport(dedicatedRoleStatName("role.2/job.2")); replayAndBuild(); schedulerActivated(); @@ -204,6 +208,8 @@ public class TaskVarsTest extends EasyMockTest { assertEquals(0, getValue(RUNNING)); assertEquals(1, getValue(FINISHED)); assertEquals(0, getValue(rackStatName("rackA"))); + assertEquals(0, getValue(dedicatedRoleStatName("role-1/job-1"))); + assertEquals(0, getValue(dedicatedRoleStatName("role.2/job.2"))); vars.tasksDeleted(new TasksDeleted(ImmutableSet.of( IScheduledTask.build(taskA.newBuilder().setStatus(FINISHED))))); assertAllZero(); @@ -260,10 +266,12 @@ public class TaskVarsTest extends EasyMockTest { @Test public void testLoadsFromStorage() { expectStatusCountersInitialized(); - expectGetHostRack("hostA", "rackA").atLeastOnce(); - expectGetHostRack("hostB", "rackB").atLeastOnce(); + expectGetHostAttributes("hostA", "rackA", 
ImmutableSet.of("role-1/job-1")).atLeastOnce(); + expectGetHostAttributes("hostB", "rackB", ImmutableSet.of("role.2/job.2")).atLeastOnce(); expectStatExport(rackStatName("rackA")); expectStatExport(rackStatName("rackB")); + expectStatExport(dedicatedRoleStatName("role-1/job-1")); + expectStatExport(dedicatedRoleStatName("role.2/job.2")); IScheduledTask failedTask = makeTask(JOB_B, FAILED, "hostB"); expectStatExport(jobStatName(failedTask, FAILED), untrackedProvider); @@ -282,14 +290,27 @@ public class TaskVarsTest extends EasyMockTest { assertEquals(1, getValue(FAILED)); assertEquals(0, getValue(rackStatName("rackA"))); assertEquals(0, getValue(rackStatName("rackB"))); + assertEquals(0, getValue(dedicatedRoleStatName("role-1/job-1"))); + assertEquals(0, getValue(dedicatedRoleStatName("role.2/job.2"))); assertEquals(1, getValue(jobStatName(failedTask, FAILED))); } - private IExpectationSetters<?> expectGetHostRack(String host, String rackToReturn) { + private IExpectationSetters<?> expectGetHostAttributes( + String host, + String rackToReturn) { + return expectGetHostAttributes(host, rackToReturn, ImmutableSet.of()); + } + + private IExpectationSetters<?> expectGetHostAttributes( + String host, + String rackToReturn, + ImmutableSet<String> dedicatedAttrs) { + IHostAttributes attributes = IHostAttributes.build(new HostAttributes() .setHost(host) .setAttributes(ImmutableSet.of( - new Attribute().setName("rack").setValues(ImmutableSet.of(rackToReturn))))); + new Attribute().setName("rack").setValues(ImmutableSet.of(rackToReturn)), + new Attribute().setName("dedicated").setValues(dedicatedAttrs)))); return expect(storageUtil.attributeStore.getHostAttributes(host)) .andReturn(Optional.of(attributes)); } @@ -297,12 +318,14 @@ public class TaskVarsTest extends EasyMockTest { @Test public void testLostCounters() { expectStatusCountersInitialized(); - expectGetHostRack("host1", "rackA").atLeastOnce(); - expectGetHostRack("host2", "rackB").atLeastOnce(); - 
expectGetHostRack("host3", "rackB").atLeastOnce(); + expectGetHostAttributes("host1", "rackA", ImmutableSet.of("role-1/job-1")).atLeastOnce(); + expectGetHostAttributes("host2", "rackB", ImmutableSet.of("role.2/job.2")).atLeastOnce(); + expectGetHostAttributes("host3", "rackB").atLeastOnce(); expectStatExport(rackStatName("rackA")); expectStatExport(rackStatName("rackB")); + expectStatExport(dedicatedRoleStatName("role-1/job-1")); + expectStatExport(dedicatedRoleStatName("role.2/job.2")); IScheduledTask a = makeTask(JOB_A, RUNNING, "host1"); IScheduledTask b = makeTask(JOB_B, RUNNING, "host2"); @@ -325,6 +348,9 @@ public class TaskVarsTest extends EasyMockTest { assertEquals(2, getValue(rackStatName("rackA"))); assertEquals(2, getValue(rackStatName("rackB"))); + assertEquals(2, getValue(dedicatedRoleStatName("role-1/job-1"))); + assertEquals(1, getValue(dedicatedRoleStatName("role.2/job.2"))); + assertEquals(1, getValue(jobStatName(a, LOST))); assertEquals(1, getValue(jobStatName(b, LOST))); assertEquals(2, getValue(jobStatName(c, LOST))); @@ -335,6 +361,8 @@ public class TaskVarsTest extends EasyMockTest { expectStatusCountersInitialized(); expect(storageUtil.attributeStore.getHostAttributes("a")) .andReturn(Optional.empty()); + expect(storageUtil.attributeStore.getHostAttributes("a")) + .andReturn(Optional.empty()); IScheduledTask a = makeTask(JOB_A, RUNNING, "a"); expectStatExport(jobStatName(a, LOST), untrackedProvider);
