Repository: aurora Updated Branches: refs/heads/master 82aa097f1 -> 318b40d71
Compute SLA stats for non-prod jobs Testing Done: `./gradlew -Pq build` and a manual verification in Vagrant. Bugs closed: AURORA-1350 Reviewed at https://reviews.apache.org/r/35498/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/318b40d7 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/318b40d7 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/318b40d7 Branch: refs/heads/master Commit: 318b40d71d7075bdf8364f1fecdfb1d92af0d8b3 Parents: 82aa097 Author: Stephan Erb <[email protected]> Authored: Mon Jul 6 11:11:48 2015 -0700 Committer: Kevin Sweeney <[email protected]> Committed: Mon Jul 6 11:11:48 2015 -0700 ---------------------------------------------------------------------- NEWS | 3 + docs/sla.md | 7 +- .../aurora/scheduler/sla/MetricCalculator.java | 104 +++++++++++-------- .../aurora/scheduler/sla/SlaAlgorithm.java | 4 +- .../apache/aurora/scheduler/sla/SlaModule.java | 34 +++++- .../scheduler/sla/MetricCalculatorTest.java | 31 +++++- .../aurora/scheduler/sla/SlaModuleTest.java | 9 +- 7 files changed, 135 insertions(+), 57 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/318b40d7/NEWS ---------------------------------------------------------------------- diff --git a/NEWS b/NEWS index fd5b708..d780d8c 100644 --- a/NEWS +++ b/NEWS @@ -6,3 +6,6 @@ - The scheduler command line argument 'enable_legacy_constraints' has been removed, and the scheduler no longer automatically injects 'host' and 'rack' constraints for production services. (AURORA-1074) +- SLA metrics for non-production jobs have been disabled by default. They can + be enabled via the scheduler command line. Metric names have changed from + '...nonprod_ms' to "...ms_nonprod" (AURORA-1350). http://git-wip-us.apache.org/repos/asf/aurora/blob/318b40d7/docs/sla.md ---------------------------------------------------------------------- diff --git a/docs/sla.md b/docs/sla.md index 14e9108..a558e00 100644 --- a/docs/sla.md +++ b/docs/sla.md @@ -15,8 +15,9 @@ The primary goal of the feature is collection and monitoring of Aurora job SLA ( Agreements) metrics that defining a contractual relationship between the Aurora/Mesos platform and hosted services. -The Aurora SLA feature currently supports stat collection only for service (non-cron) -production jobs (`"production = True"` in your `.aurora` config). +The Aurora SLA feature is by default only enabled for service (non-cron) +production jobs (`"production = True"` in your `.aurora` config). It can be enabled for +non-production services via the scheduler command line flag `-sla_non_prod_metrics`. Counters that track SLA measurements are computed periodically within the scheduler. The individual instance metrics are refreshed every minute (configurable via @@ -173,4 +174,4 @@ unreasonable resource constraints) do not affect metric curves. * The availability of Aurora SLA metrics is bound by the scheduler availability. * All metrics are calculated at a pre-defined interval (currently set at 1 minute). - Scheduler restarts may result in missed collections. \ No newline at end of file + Scheduler restarts may result in missed collections. http://git-wip-us.apache.org/repos/asf/aurora/blob/318b40d7/src/main/java/org/apache/aurora/scheduler/sla/MetricCalculator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/sla/MetricCalculator.java b/src/main/java/org/apache/aurora/scheduler/sla/MetricCalculator.java index 82f36d5..7dca574 100644 --- a/src/main/java/org/apache/aurora/scheduler/sla/MetricCalculator.java +++ b/src/main/java/org/apache/aurora/scheduler/sla/MetricCalculator.java @@ -16,6 +16,7 @@ package org.apache.aurora.scheduler.sla; import java.util.Collection; import java.util.List; import java.util.Map.Entry; +import java.util.Set; import java.util.concurrent.atomic.AtomicReference; import javax.inject.Inject; @@ -65,36 +66,41 @@ import static org.apache.aurora.scheduler.sla.SlaGroup.GroupType.RESOURCE_RAM; class MetricCalculator implements Runnable { @VisibleForTesting - static final Multimap<AlgorithmType, GroupType> PROD_METRICS = - ImmutableMultimap.<AlgorithmType, GroupType>builder() - .put(JOB_UPTIME_50, JOB) - .put(JOB_UPTIME_75, JOB) - .put(JOB_UPTIME_90, JOB) - .put(JOB_UPTIME_95, JOB) - .put(JOB_UPTIME_99, JOB) - .putAll(AGGREGATE_PLATFORM_UPTIME, JOB, CLUSTER) - .putAll(MEDIAN_TIME_TO_ASSIGNED, JOB, CLUSTER, RESOURCE_CPU, RESOURCE_RAM, RESOURCE_DISK) - .putAll(MEDIAN_TIME_TO_RUNNING, JOB, CLUSTER, RESOURCE_CPU, RESOURCE_RAM, RESOURCE_DISK) - .build(); + static final String NAME_QUALIFIER_PROD = ""; @VisibleForTesting - static final Multimap<AlgorithmType, GroupType> NON_PROD_METRICS = - ImmutableMultimap.<AlgorithmType, GroupType>builder() - .putAll( - AlgorithmType.MEDIAN_TIME_TO_ASSIGNED_NON_PROD, - JOB, - CLUSTER, - RESOURCE_CPU, - RESOURCE_RAM, - RESOURCE_DISK) - .putAll( - AlgorithmType.MEDIAN_TIME_TO_RUNNING_NON_PROD, - JOB, - CLUSTER, - RESOURCE_CPU, - RESOURCE_RAM, - RESOURCE_DISK) - .build(); + static final String NAME_QUALIFIER_NON_PROD = "_nonprod"; + + /** + * Pre-configured categories of metrics. + */ + enum MetricCategory { + + JOB_UPTIMES(ImmutableMultimap.<AlgorithmType, GroupType>builder() + .put(JOB_UPTIME_50, JOB) + .put(JOB_UPTIME_75, JOB) + .put(JOB_UPTIME_90, JOB) + .put(JOB_UPTIME_95, JOB) + .put(JOB_UPTIME_99, JOB) + .build()), + PLATFORM_UPTIME(ImmutableMultimap.<AlgorithmType, GroupType>builder() + .putAll(AGGREGATE_PLATFORM_UPTIME, JOB, CLUSTER) + .build()), + MEDIANS(ImmutableMultimap.<AlgorithmType, GroupType>builder() + .putAll(MEDIAN_TIME_TO_ASSIGNED, JOB, CLUSTER, RESOURCE_CPU, RESOURCE_RAM, RESOURCE_DISK) + .putAll(MEDIAN_TIME_TO_RUNNING, JOB, CLUSTER, RESOURCE_CPU, RESOURCE_RAM, RESOURCE_DISK) + .build()); + + private final Multimap<AlgorithmType, GroupType> metrics; + + MetricCategory(Multimap<AlgorithmType, GroupType> metrics) { + this.metrics = metrics; + } + + Multimap<AlgorithmType, GroupType> getMetrics() { + return metrics; + } + } private static final Predicate<ITaskConfig> IS_SERVICE = new Predicate<ITaskConfig>() { @@ -111,14 +117,23 @@ class MetricCalculator implements Runnable { static class MetricCalculatorSettings { private final long refreshRateMs; + private final Set<MetricCategory> prodMetrics; + private final Set<MetricCategory> nonProdMetrics; + + MetricCalculatorSettings( + long refreshRateMs, + Set<MetricCategory> prodMetrics, + Set<MetricCategory> nonProdMetrics) { - MetricCalculatorSettings(long refreshRateMs) { this.refreshRateMs = refreshRateMs; + this.prodMetrics = requireNonNull(prodMetrics); + this.nonProdMetrics = requireNonNull(nonProdMetrics); } long getRefreshRateMs() { return refreshRateMs; } + } private static class Counter implements Supplier<Number> { @@ -179,25 +194,28 @@ class MetricCalculator implements Runnable { Tasks.SCHEDULED_TO_INFO)).toList(); long nowMs = clock.nowMillis(); - Range<Long> timeRange = Range.closedOpen(nowMs - settings.getRefreshRateMs(), nowMs); + Range<Long> timeRange = Range.closedOpen(nowMs - settings.refreshRateMs, nowMs); - runAlgorithms(prodTasks, PROD_METRICS, timeRange); - runAlgorithms(nonProdTasks, NON_PROD_METRICS, timeRange); + runAlgorithms(prodTasks, settings.prodMetrics, timeRange, NAME_QUALIFIER_PROD); + runAlgorithms(nonProdTasks, settings.nonProdMetrics, timeRange, NAME_QUALIFIER_NON_PROD); } private void runAlgorithms( List<IScheduledTask> tasks, - Multimap<AlgorithmType, GroupType> metrics, - Range<Long> timeRange) { - - for (Entry<AlgorithmType, GroupType> slaMetric : metrics.entries()) { - for (Entry<String, Collection<IScheduledTask>> namedGroup - : slaMetric.getValue().getSlaGroup().createNamedGroups(tasks).asMap().entrySet()) { - - AlgorithmType algoType = slaMetric.getKey(); - String metricName = namedGroup.getKey() + algoType.getAlgorithmName(); - metricCache.getUnchecked(metricName) - .set(metricName, algoType.getAlgorithm().calculate(namedGroup.getValue(), timeRange)); + Set<MetricCategory> categories, + Range<Long> timeRange, + String nameQualifier) { + + for (MetricCategory category : categories) { + for (Entry<AlgorithmType, GroupType> slaMetric : category.getMetrics().entries()) { + for (Entry<String, Collection<IScheduledTask>> namedGroup + : slaMetric.getValue().getSlaGroup().createNamedGroups(tasks).asMap().entrySet()) { + + AlgorithmType algoType = slaMetric.getKey(); + String metricName = namedGroup.getKey() + algoType.getAlgorithmName() + nameQualifier; + metricCache.getUnchecked(metricName) + .set(metricName, algoType.getAlgorithm().calculate(namedGroup.getValue(), timeRange)); + } } } } http://git-wip-us.apache.org/repos/asf/aurora/blob/318b40d7/src/main/java/org/apache/aurora/scheduler/sla/SlaAlgorithm.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/sla/SlaAlgorithm.java b/src/main/java/org/apache/aurora/scheduler/sla/SlaAlgorithm.java index ff73ca6..42967df 100644 --- a/src/main/java/org/apache/aurora/scheduler/sla/SlaAlgorithm.java +++ b/src/main/java/org/apache/aurora/scheduler/sla/SlaAlgorithm.java @@ -72,9 +72,7 @@ interface SlaAlgorithm { JOB_UPTIME_50(new JobUptime(50f), String.format(JobUptime.NAME_FORMAT, 50f)), AGGREGATE_PLATFORM_UPTIME(new AggregatePlatformUptime(), "platform_uptime_percent"), MEDIAN_TIME_TO_ASSIGNED(new MedianAlgorithm(ASSIGNED), "mtta_ms"), - MEDIAN_TIME_TO_RUNNING(new MedianAlgorithm(RUNNING), "mttr_ms"), - MEDIAN_TIME_TO_ASSIGNED_NON_PROD(new MedianAlgorithm(ASSIGNED), "mtta_nonprod_ms"), - MEDIAN_TIME_TO_RUNNING_NON_PROD(new MedianAlgorithm(RUNNING), "mttr_nonprod_ms"); + MEDIAN_TIME_TO_RUNNING(new MedianAlgorithm(RUNNING), "mttr_ms"); private final SlaAlgorithm algorithm; private final String name; http://git-wip-us.apache.org/repos/asf/aurora/blob/318b40d7/src/main/java/org/apache/aurora/scheduler/sla/SlaModule.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/sla/SlaModule.java b/src/main/java/org/apache/aurora/scheduler/sla/SlaModule.java index 64e986f..384dcf5 100644 --- a/src/main/java/org/apache/aurora/scheduler/sla/SlaModule.java +++ b/src/main/java/org/apache/aurora/scheduler/sla/SlaModule.java @@ -15,6 +15,7 @@ package org.apache.aurora.scheduler.sla; import java.lang.annotation.Retention; import java.lang.annotation.Target; +import java.util.Set; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.logging.Logger; @@ -23,6 +24,7 @@ import javax.inject.Inject; import javax.inject.Qualifier; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; import com.google.common.util.concurrent.AbstractIdleService; import com.google.inject.AbstractModule; import com.google.inject.Singleton; @@ -35,6 +37,7 @@ import com.twitter.common.quantity.Time; import org.apache.aurora.scheduler.SchedulerServicesModule; import org.apache.aurora.scheduler.base.AsyncUtil; import org.apache.aurora.scheduler.sla.MetricCalculator.MetricCalculatorSettings; +import org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory; import static java.lang.annotation.ElementType.FIELD; import static java.lang.annotation.ElementType.METHOD; @@ -42,6 +45,10 @@ import static java.lang.annotation.ElementType.PARAMETER; import static java.lang.annotation.RetentionPolicy.RUNTIME; import static java.util.Objects.requireNonNull; +import static org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory.JOB_UPTIMES; +import static org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory.MEDIANS; +import static org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory.PLATFORM_UPTIME; + /** * Binding module for the sla processor. */ @@ -54,26 +61,47 @@ public class SlaModule extends AbstractModule { private static final Arg<Amount<Long, Time>> SLA_REFRESH_INTERVAL = Arg.create(Amount.of(1L, Time.MINUTES)); + @CmdLine(name = "sla_prod_metrics", + help = "Metric categories collected for production tasks.") + private static final Arg<Set<MetricCategory>> SLA_PROD_METRICS = + Arg.<Set<MetricCategory>>create(ImmutableSet.of(JOB_UPTIMES, PLATFORM_UPTIME, MEDIANS)); + + @CmdLine(name = "sla_non_prod_metrics", + help = "Metric categories collected for non production tasks.") + private static final Arg<Set<MetricCategory>> SLA_NON_PROD_METRICS = + Arg.<Set<MetricCategory>>create(ImmutableSet.of()); + @VisibleForTesting @Qualifier @Target({ FIELD, PARAMETER, METHOD }) @Retention(RUNTIME) @interface SlaExecutor { } private final Amount<Long, Time> refreshInterval; + private final Set<MetricCategory> prodMetrics; + private final Set<MetricCategory> nonProdMetrics; @VisibleForTesting - SlaModule(Amount<Long, Time> refreshInterval) { + SlaModule( + Amount<Long, Time> refreshInterval, + Set<MetricCategory> prodMetrics, + Set<MetricCategory> nonProdMetrics) { + this.refreshInterval = refreshInterval; + this.prodMetrics = prodMetrics; + this.nonProdMetrics = nonProdMetrics; } public SlaModule() { - this(SLA_REFRESH_INTERVAL.get()); + this(SLA_REFRESH_INTERVAL.get(), SLA_PROD_METRICS.get(), SLA_NON_PROD_METRICS.get()); } @Override protected void configure() { bind(MetricCalculatorSettings.class) - .toInstance(new MetricCalculatorSettings(refreshInterval.as(Time.MILLISECONDS))); + .toInstance(new MetricCalculatorSettings( + refreshInterval.as(Time.MILLISECONDS), + prodMetrics, + nonProdMetrics)); bind(MetricCalculator.class).in(Singleton.class); bind(ScheduledExecutorService.class) http://git-wip-us.apache.org/repos/asf/aurora/blob/318b40d7/src/test/java/org/apache/aurora/scheduler/sla/MetricCalculatorTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/aurora/scheduler/sla/MetricCalculatorTest.java b/src/test/java/org/apache/aurora/scheduler/sla/MetricCalculatorTest.java index d9c7b07..8e552e9 100644 --- a/src/test/java/org/apache/aurora/scheduler/sla/MetricCalculatorTest.java +++ b/src/test/java/org/apache/aurora/scheduler/sla/MetricCalculatorTest.java @@ -17,6 +17,7 @@ import java.util.Map; import java.util.Set; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; import com.twitter.common.quantity.Amount; @@ -27,6 +28,7 @@ import com.twitter.common.util.testing.FakeClock; import org.apache.aurora.scheduler.base.Query; import org.apache.aurora.scheduler.sla.MetricCalculator.MetricCalculatorSettings; +import org.apache.aurora.scheduler.sla.SlaGroup.GroupType; import org.apache.aurora.scheduler.storage.entities.IScheduledTask; import org.apache.aurora.scheduler.storage.testing.StorageTestUtil; import org.easymock.Capture; @@ -35,8 +37,11 @@ import org.easymock.EasyMock; import org.junit.Test; import static org.apache.aurora.gen.ScheduleStatus.PENDING; -import static org.apache.aurora.scheduler.sla.MetricCalculator.NON_PROD_METRICS; -import static org.apache.aurora.scheduler.sla.MetricCalculator.PROD_METRICS; +import static org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory.JOB_UPTIMES; +import static org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory.MEDIANS; +import static org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory.PLATFORM_UPTIME; +import static org.apache.aurora.scheduler.sla.MetricCalculator.NAME_QUALIFIER_NON_PROD; +import static org.apache.aurora.scheduler.sla.MetricCalculator.NAME_QUALIFIER_PROD; import static org.apache.aurora.scheduler.sla.SlaAlgorithm.AlgorithmType; import static org.apache.aurora.scheduler.sla.SlaTestUtil.makeTask; import static org.easymock.EasyMock.expect; @@ -44,12 +49,29 @@ import static org.junit.Assert.assertEquals; public class MetricCalculatorTest extends EasyMockTest { + static final Multimap<AlgorithmType, GroupType> PROD_METRICS = + ImmutableMultimap.<AlgorithmType, GroupType>builder() + .putAll(JOB_UPTIMES.getMetrics()) + .putAll(MEDIANS.getMetrics()) + .putAll(PLATFORM_UPTIME.getMetrics()) + .build(); + + static final Multimap<AlgorithmType, GroupType> NON_PROD_METRICS = + ImmutableMultimap.<AlgorithmType, GroupType>builder() + .putAll(JOB_UPTIMES.getMetrics()) + .putAll(MEDIANS.getMetrics()) + .putAll(PLATFORM_UPTIME.getMetrics()) + .build(); + @Test public void runTest() { FakeClock clock = new FakeClock(); StatsProvider statsProvider = createMock(StatsProvider.class); StatsProvider untracked = createMock(StatsProvider.class); - MetricCalculatorSettings settings = new MetricCalculatorSettings(10000); + MetricCalculatorSettings settings = new MetricCalculatorSettings( + 10000, + ImmutableSet.of(JOB_UPTIMES, MEDIANS, PLATFORM_UPTIME), + ImmutableSet.of(JOB_UPTIMES, MEDIANS, PLATFORM_UPTIME)); StorageTestUtil storageUtil = new StorageTestUtil(this); MetricCalculator calculator = new MetricCalculator( storageUtil.storage, @@ -92,7 +114,8 @@ public class MetricCalculatorTest extends EasyMockTest { for (Multimap<AlgorithmType, SlaGroup.GroupType> definition : definitions) { for (Map.Entry<AlgorithmType, SlaGroup.GroupType> entry : definition.entries()) { for (String metric : entry.getValue().getSlaGroup().createNamedGroups(tasks).keys()) { - names.add(metric + entry.getKey().getAlgorithmName()); + names.add(metric + entry.getKey().getAlgorithmName() + NAME_QUALIFIER_PROD); + names.add(metric + entry.getKey().getAlgorithmName() + NAME_QUALIFIER_NON_PROD); } } } http://git-wip-us.apache.org/repos/asf/aurora/blob/318b40d7/src/test/java/org/apache/aurora/scheduler/sla/SlaModuleTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/aurora/scheduler/sla/SlaModuleTest.java b/src/test/java/org/apache/aurora/scheduler/sla/SlaModuleTest.java index 763a6c4..300ab2e 100644 --- a/src/test/java/org/apache/aurora/scheduler/sla/SlaModuleTest.java +++ b/src/test/java/org/apache/aurora/scheduler/sla/SlaModuleTest.java @@ -19,6 +19,7 @@ import java.util.concurrent.ScheduledThreadPoolExecutor; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import com.google.inject.AbstractModule; import com.google.inject.Guice; import com.google.inject.Injector; @@ -44,6 +45,9 @@ import org.junit.Before; import org.junit.Test; import static org.apache.aurora.gen.ScheduleStatus.PENDING; +import static org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory.JOB_UPTIMES; +import static org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory.MEDIANS; +import static org.apache.aurora.scheduler.sla.MetricCalculator.MetricCategory.PLATFORM_UPTIME; import static org.easymock.EasyMock.expect; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -61,7 +65,10 @@ public class SlaModuleTest extends EasyMockTest { storageUtil = new StorageTestUtil(this); clock = new FakeClock(); statsProvider = createMock(StatsProvider.class); - module = new SlaModule(Amount.of(5L, Time.MILLISECONDS)); + module = new SlaModule( + Amount.of(5L, Time.MILLISECONDS), + ImmutableSet.of(JOB_UPTIMES, MEDIANS, PLATFORM_UPTIME), + ImmutableSet.of(JOB_UPTIMES, MEDIANS, PLATFORM_UPTIME)); injector = Guice.createInjector( ImmutableList.<Module>builder() .add(module)
