Repository: aurora Updated Branches: refs/heads/master b76e38f9a -> edd47d08f
Adding support for GPU resource Reviewed at https://reviews.apache.org/r/47869/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/edd47d08 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/edd47d08 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/edd47d08 Branch: refs/heads/master Commit: edd47d08f57a816d524f4a353a1cb619ef093b89 Parents: b76e38f Author: Maxim Khutornenko <[email protected]> Authored: Mon May 30 18:43:47 2016 -0700 Committer: Maxim Khutornenko <[email protected]> Committed: Mon May 30 18:43:47 2016 -0700 ---------------------------------------------------------------------- RELEASE-NOTES.md | 17 ++++++++ .../thrift/org/apache/aurora/gen/api.thrift | 11 +++++- docs/features/resource-isolation.md | 12 ++++++ docs/reference/configuration.md | 1 + .../mesos_config/etc_mesos-slave/resources | 2 +- examples/vagrant/upstart/aurora-scheduler.conf | 3 +- .../apache/aurora/scheduler/app/AppModule.java | 6 ++- .../aurora/scheduler/base/TaskTestUtil.java | 1 + .../configuration/ConfigurationManager.java | 14 ++++++- .../scheduler/resources/ResourceType.java | 15 +++++++ src/main/python/apache/aurora/config/thrift.py | 8 +++- .../python/apache/thermos/config/schema_base.py | 1 + .../apache/thermos/config/schema_helpers.py | 17 +++++--- .../scheduler/assets/configSummary.html | 15 ++++--- .../resources/scheduler/assets/js/filters.js | 39 ++++++++++++++----- .../configuration/ConfigurationManagerTest.java | 35 +++++++++++++---- .../resources/ResourceManagerTest.java | 2 + .../aurora/scheduler/thrift/ThriftIT.java | 3 +- .../python/apache/aurora/config/test_thrift.py | 3 +- .../python/apache/thermos/config/test_schema.py | 33 ++++++++++------ .../apache/aurora/e2e/http/http_example.aurora | 41 ++++++++++++-------- .../http/http_example_bad_healthcheck.aurora | 41 ++++++++++++-------- .../aurora/e2e/http/http_example_updated.aurora | 41 ++++++++++++-------- .../sh/org/apache/aurora/e2e/test_end_to_end.sh | 11 ++++-- 24 files changed, 269 insertions(+), 103 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/RELEASE-NOTES.md ---------------------------------------------------------------------- diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 6e878c5..21a141f 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -24,6 +24,21 @@ reference to either a `Docker` or `Mesos` container. - New scheduler command line argument `-ip` to control what ip address to bind the schedulers http server to. +- Added experimental support for Mesos GPU resource. This feature will be available in Mesos 0.29.0 + and is disabled by default. Use `-allow_gpu_resource` flag to enable it. + + **IMPORTANT: once this feature is enabled, creating jobs with GPU resource will make scheduler + snapshot backwards incompatible. Scheduler will be unable to read snapshot if rolled back to + previous version. If rollback is absolutely necessary, perform the following steps:** + 1. Set `-allow_gpu_resource` to false + 2. Delete all jobs with GPU resource (including cron job schedules if applicable) + 3. Wait until GPU task history is pruned. You may speed it up by changing the history retention + flags, e.g.: `-history_prune_threshold=1mins` and `-history_max_per_job_threshold=0` + 4. In case there were GPU job updates created, prune job update history for affected jobs from + `/h2console` endpoint or reduce job update pruning thresholds, e.g.: + `-job_update_history_pruning_threshold=1mins` and `-job_update_history_per_job_threshold=0` + 5. Ensure a new snapshot is created by running `aurora_admin scheduler_snapshot <cluster>` + 6. Rollback to previous version ### Deprecations and removals: @@ -37,6 +52,8 @@ sandbox. - Setting the `container` property of a `Job` to a `Container` holder is deprecated in favor of setting it directly to the appropriate (i.e. `Docker` or `Mesos`) container type. +- Deprecated `numCpus`, `ramMb` and `diskMb` fields in `TaskConfig` and `ResourceAggregate` thrift + structs. Use `set<Resource> resources` to specify task resources or quota values. 0.13.0 ------ http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/api/src/main/thrift/org/apache/aurora/gen/api.thrift ---------------------------------------------------------------------- diff --git a/api/src/main/thrift/org/apache/aurora/gen/api.thrift b/api/src/main/thrift/org/apache/aurora/gen/api.thrift index a99889c..ed94f24 100644 --- a/api/src/main/thrift/org/apache/aurora/gen/api.thrift +++ b/api/src/main/thrift/org/apache/aurora/gen/api.thrift @@ -223,18 +223,22 @@ union Resource { 2: i64 ramMb 3: i64 diskMb 4: string namedPort + 5: i64 numGpus } /** Description of the tasks contained within a job. */ struct TaskConfig { /** Job task belongs to. */ 28: JobKey job - // TODO(maxim): Remove in 0.7.0. (AURORA-749) + // TODO(maxim): Deprecated. See AURORA-749. /** contains the role component of JobKey */ 17: Identity owner 7: bool isService + // TODO(maxim): Deprecated. See AURORA-1707. 8: double numCpus + // TODO(maxim): Deprecated. See AURORA-1707. 9: i64 ramMb + // TODO(maxim): Deprecated. See AURORA-1707. 10: i64 diskMb 11: i32 priority 13: i32 maxTaskFailures @@ -267,10 +271,13 @@ struct TaskConfig { } struct ResourceAggregate { + // TODO(maxim): Deprecated. See AURORA-1707. /** Number of CPU cores allotted. */ 1: double numCpus + // TODO(maxim): Deprecated. See AURORA-1707. /** Megabytes of RAM allotted. */ 2: i64 ramMb + // TODO(maxim): Deprecated. See AURORA-1707. /** Megabytes of disk space allotted. */ 3: i64 diskMb /** Aggregated resource values. */ @@ -299,7 +306,7 @@ struct JobConfiguration { * used to construct it server-side. */ 9: JobKey key - // TODO(maxim): Remove in 0.7.0. (AURORA-749) + // TODO(maxim): Deprecated. See AURORA-749. /** Owner of this job. */ 7: Identity owner /** http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/docs/features/resource-isolation.md ---------------------------------------------------------------------- diff --git a/docs/features/resource-isolation.md b/docs/features/resource-isolation.md index d08b59f..59da823 100644 --- a/docs/features/resource-isolation.md +++ b/docs/features/resource-isolation.md @@ -101,6 +101,14 @@ are still available but you shouldn't count on them being so. application can write above its quota without getting an ENOSPC, but it will be killed shortly after. This is subject to change. +### GPU Isolation + +GPU isolation will be supported for Nvidia devices starting from Mesos 0.29.0. +Access to the allocated units will be exclusive with no sharing between tasks +allowed (e.g. no fractional GPU allocation). Until official documentation is released, +see [Mesos design document](https://docs.google.com/document/d/10GJ1A80x4nIEo8kfdeo9B11PIbS1xJrrB4Z373Ifkpo/edit#heading=h.w84lz7p4eexl) +for more details. + ### Other Resources Other resources, such as network bandwidth, do not have any performance @@ -141,6 +149,10 @@ add the maximum size of the Java heap to your disk space requirement, in order to account for an out of memory error dumping the heap into the application's sandbox space. +## GPU Sizing + +GPU is highly dependent on your application requirements and is only limited +by the number of physical GPU units available on a target box. Oversubscription ---------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/docs/reference/configuration.md ---------------------------------------------------------------------- diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index e77ee60..c56ace0 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -321,6 +321,7 @@ resources are allocated. ```cpu``` | Float | Fractional number of cores required by the task. ```ram``` | Integer | Bytes of RAM required by the task. ```disk``` | Integer | Bytes of disk required by the task. + ```gpu``` | Integer | Number of GPU cores required by the task Job Schema http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/examples/vagrant/mesos_config/etc_mesos-slave/resources ---------------------------------------------------------------------- diff --git a/examples/vagrant/mesos_config/etc_mesos-slave/resources b/examples/vagrant/mesos_config/etc_mesos-slave/resources index 5bfe779..aa0e97e 100644 --- a/examples/vagrant/mesos_config/etc_mesos-slave/resources +++ b/examples/vagrant/mesos_config/etc_mesos-slave/resources @@ -1 +1 @@ -cpus(aurora-role):0.5;cpus(*):3.5;mem(aurora-role):1024;disk:20000 +cpus(aurora-role):0.5;cpus(*):3.5;mem(aurora-role):1024;disk:20000;gpus(*):4 http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/examples/vagrant/upstart/aurora-scheduler.conf ---------------------------------------------------------------------- diff --git a/examples/vagrant/upstart/aurora-scheduler.conf b/examples/vagrant/upstart/aurora-scheduler.conf index 3d9e706..954ddb4 100644 --- a/examples/vagrant/upstart/aurora-scheduler.conf +++ b/examples/vagrant/upstart/aurora-scheduler.conf @@ -51,4 +51,5 @@ exec bin/aurora-scheduler \ -tier_config=/home/vagrant/aurora/src/main/resources/org/apache/aurora/scheduler/tiers.json \ -mesos_role=aurora-role \ -populate_discovery_info=true \ - -receive_revocable_resources=true + -receive_revocable_resources=true \ + -allow_gpu_resource=true http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/java/org/apache/aurora/scheduler/app/AppModule.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/app/AppModule.java b/src/main/java/org/apache/aurora/scheduler/app/AppModule.java index c81bd79..6c7c75a 100644 --- a/src/main/java/org/apache/aurora/scheduler/app/AppModule.java +++ b/src/main/java/org/apache/aurora/scheduler/app/AppModule.java @@ -94,6 +94,9 @@ public class AppModule extends AbstractModule { help = "If false, Docker tasks may run without an executor (EXPERIMENTAL)") private static final Arg<Boolean> REQUIRE_DOCKER_USE_EXECUTOR = Arg.create(true); + @CmdLine(name = "allow_gpu_resource", help = "Allow jobs to request Mesos GPU resource.") + private static final Arg<Boolean> ALLOW_GPU_RESOURCE = Arg.create(false); + private final ConfigurationManagerSettings configurationManagerSettings; @VisibleForTesting @@ -106,7 +109,8 @@ public class AppModule extends AbstractModule { ImmutableSet.copyOf(ALLOWED_CONTAINER_TYPES.get()), ENABLE_DOCKER_PARAMETERS.get(), DEFAULT_DOCKER_PARAMETERS.get(), - REQUIRE_DOCKER_USE_EXECUTOR.get())); + REQUIRE_DOCKER_USE_EXECUTOR.get(), + ALLOW_GPU_RESOURCE.get())); } @Override http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java b/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java index 221417f..e431b58 100644 --- a/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java +++ b/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java @@ -68,6 +68,7 @@ public final class TaskTestUtil { ImmutableSet.of(_Fields.MESOS), false, ImmutableMultimap.of(), + true, true); public static final ConfigurationManager CONFIGURATION_MANAGER = new ConfigurationManager(CONFIGURATION_MANAGER_SETTINGS, DEV_TIER_MANAGER); http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java b/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java index 4ef202c..0e95620 100644 --- a/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java +++ b/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java @@ -51,6 +51,7 @@ import org.apache.aurora.scheduler.storage.log.ThriftBackfill; import static java.util.Objects.requireNonNull; +import static org.apache.aurora.scheduler.resources.ResourceType.GPUS; import static org.apache.aurora.scheduler.resources.ResourceType.PORTS; /** @@ -114,17 +115,20 @@ public class ConfigurationManager { private final boolean allowDockerParameters; private final Multimap<String, String> defaultDockerParameters; private final boolean requireDockerUseExecutor; + private final boolean allowGpuResource; public ConfigurationManagerSettings( ImmutableSet<Container._Fields> allowedContainerTypes, boolean allowDockerParameters, Multimap<String, String> defaultDockerParameters, - boolean requireDockerUseExecutor) { + boolean requireDockerUseExecutor, + boolean allowGpuResource) { this.allowedContainerTypes = requireNonNull(allowedContainerTypes); this.allowDockerParameters = allowDockerParameters; this.defaultDockerParameters = requireNonNull(defaultDockerParameters); this.requireDockerUseExecutor = requireDockerUseExecutor; + this.allowGpuResource = allowGpuResource; } } @@ -321,6 +325,14 @@ public class ConfigurationManager { throw new TaskDescriptionException("Multiple resource values are not supported for " + types); } + if (!settings.allowGpuResource && config.getResources().stream() + .filter(r -> ResourceType.fromResource(r).equals(GPUS)) + .findAny() + .isPresent()) { + + throw new TaskDescriptionException("GPU resource support is disabled in this cluster."); + } + maybeFillLinks(builder); return ITaskConfig.build(builder); http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java b/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java index 6a4f110..bfe7583 100644 --- a/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java +++ b/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java @@ -100,6 +100,21 @@ public enum ResourceType implements TEnum { "count", 1000, true, + false), + + /** + * GPU resource. + */ + GPUS( + _Fields.NUM_GPUS, + SCALAR, + "gpus", + LONG, + Optional.empty(), + "GPU", + "core(s)", + 4, + false, false); /** http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/python/apache/aurora/config/thrift.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/config/thrift.py b/src/main/python/apache/aurora/config/thrift.py index 81a5055..3539469 100644 --- a/src/main/python/apache/aurora/config/thrift.py +++ b/src/main/python/apache/aurora/config/thrift.py @@ -268,10 +268,14 @@ def convert(job, metadata=frozenset(), ports=frozenset()): if task.numCpus <= 0 or task.ramMb <= 0 or task.diskMb <= 0: raise InvalidConfig('Task has invalid resources. cpu/ramMb/diskMb must all be positive: ' 'cpu:%r ramMb:%r diskMb:%r' % (task.numCpus, task.ramMb, task.diskMb)) + numGpus = fully_interpolated(task_raw.resources().gpu()) task.resources = frozenset( - [Resource(numCpus=task.numCpus), Resource(ramMb=task.ramMb), Resource(diskMb=task.diskMb)] + - [Resource(namedPort=p) for p in ports]) + [Resource(numCpus=task.numCpus), + Resource(ramMb=task.ramMb), + Resource(diskMb=task.diskMb)] + + [Resource(namedPort=p) for p in ports] + + [Resource(numGpus=numGpus)] if numGpus else []) task.job = key task.owner = owner http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/python/apache/thermos/config/schema_base.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/thermos/config/schema_base.py b/src/main/python/apache/thermos/config/schema_base.py index a6768e6..f9e96d9 100644 --- a/src/main/python/apache/thermos/config/schema_base.py +++ b/src/main/python/apache/thermos/config/schema_base.py @@ -51,6 +51,7 @@ class Resources(Struct): cpu = Required(Float) ram = Required(Integer) disk = Required(Integer) + gpu = Default(Integer, 0) class Constraint(Struct): http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/python/apache/thermos/config/schema_helpers.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/thermos/config/schema_helpers.py b/src/main/python/apache/thermos/config/schema_helpers.py index 46394bb..cecfd80 100644 --- a/src/main/python/apache/thermos/config/schema_helpers.py +++ b/src/main/python/apache/thermos/config/schema_helpers.py @@ -69,9 +69,13 @@ class Units(object): def add(r1, r2): return Resources(cpu=add_unit(r1.cpu(), r2.cpu()), ram=add_unit(r1.ram(), r2.ram()), - disk=add_unit(r1.disk(), r2.disk())) + disk=add_unit(r1.disk(), r2.disk()), + gpu=add_unit(r1.gpu(), r2.gpu())) - return reduce(add, map(cls.optional_resources, resources), Resources(cpu=0, ram=0, disk=0)) + return reduce( + add, + map(cls.optional_resources, resources), + Resources(cpu=0, ram=0, disk=0, gpu=0)) @classmethod def finalization_wait_sum(cls, waits): @@ -88,10 +92,13 @@ class Units(object): def resource_max(r1, r2): return Resources(cpu=max_unit(r1.cpu(), r2.cpu()), ram=max_unit(r1.ram(), r2.ram()), - disk=max_unit(r1.disk(), r2.disk())) + disk=max_unit(r1.disk(), r2.disk()), + gpu=max_unit(r1.gpu(), r2.gpu())) - return reduce(resource_max, - map(cls.optional_resources, resources), Resources(cpu=0, ram=0, disk=0)) + return reduce( + resource_max, + map(cls.optional_resources, resources), + Resources(cpu=0, ram=0, disk=0, gpu=0)) @classmethod def finalization_wait_max(cls, waits): http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/resources/scheduler/assets/configSummary.html ---------------------------------------------------------------------- diff --git a/src/main/resources/scheduler/assets/configSummary.html b/src/main/resources/scheduler/assets/configSummary.html index 36df616..86a87ab 100644 --- a/src/main/resources/scheduler/assets/configSummary.html +++ b/src/main/resources/scheduler/assets/configSummary.html @@ -17,10 +17,15 @@ configuration details for instances {{group.label}} </th> <tr> - <td class="cellLabel" rowspan="3">resources</td> + <td class="cellLabel" + rowspan="{{group.summary.schedulingDetail.resources | toResourceValue}}">resources</td> <td>cpu</td> <td>{{group.summary.schedulingDetail.resources | toResourceValue:'CPUS'}}</td> </tr> + <tr ng-if='(group.summary.schedulingDetail.resources | toResourceValue:"GPUS").length > 0'> + <td>gpu</td> + <td>{{group.summary.schedulingDetail.resources | toResourceValue:'GPUS'}}</td> + </tr> <tr> <td>ram</td> <td>{{group.summary.schedulingDetail.resources | toResourceValue:'RAM_MB'}}</td> @@ -29,6 +34,10 @@ <td>disk</td> <td>{{group.summary.schedulingDetail.resources | toResourceValue:'DISK_MB'}}</td> </tr> + <tr ng-if='(group.summary.schedulingDetail.resources | toResourceValue:"PORTS").length > 0'> + <td>ports</td> + <td>{{group.summary.schedulingDetail.resources | toResourceValue:'PORTS'}}</td> + </tr> <tr> <td class="cellLabel">constraints</td> <td colspan="2">{{group.summary.schedulingDetail.constraints}}</td> @@ -41,10 +50,6 @@ <td class="cellLabel">service</td> <td colspan="2">{{group.summary.schedulingDetail.isService}}</td> </tr> - <tr ng-if='(group.summary.schedulingDetail.resources | toResourceValue:"PORTS").length > 0'> - <td class="cellLabel">ports</td> - <td colspan="2">{{group.summary.schedulingDetail.resources | toResourceValue:'PORTS'}}</td> - </tr> <tr ng-if='group.summary.schedulingDetail.metadata'> <td class="cellLabel">metadata</td> <td colspan="2">{{group.summary.schedulingDetail.metadata}}</td> http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/resources/scheduler/assets/js/filters.js ---------------------------------------------------------------------- diff --git a/src/main/resources/scheduler/assets/js/filters.js b/src/main/resources/scheduler/assets/js/filters.js index 98f786e..ec35d81 100644 --- a/src/main/resources/scheduler/assets/js/filters.js +++ b/src/main/resources/scheduler/assets/js/filters.js @@ -95,31 +95,50 @@ return function (resources, type) { var RESOURCE_MAP = { 'CPUS': { - filter: function (e) { return e.numCpus !== null; }, - format: function (v) { return _.first(v).numCpus + ' cores'; } + field: 'numCpus', + format: function (v) { return _.first(v)[this.field] + ' core(s)'; } }, 'RAM_MB': { - filter: function (e) { return e.ramMb !== null; }, - format: function (v) { return formatMem(_.first(v).ramMb); } + field: 'ramMb', + format: function (v) { return formatMem(_.first(v)[this.field]); } }, 'DISK_MB': { - filter: function (e) { return e.diskMb !== null; }, - format: function (v) { return formatMem(_.first(v).diskMb); } + field: 'diskMb', + format: function (v) { return formatMem(_.first(v)[this.field]); } }, 'PORTS': { - filter: function (e) { return e.namedPort !== null; }, + field: 'namedPort', format: function (v) { + var field = this.field; return _.chain(v) - .map(function (r) { return r.namedPort; }) + .map(function (r) { return r[field]; }) .sortBy() .value() .join(', '); } + }, + 'GPUS': { + field: 'numGpus', + format: function (v) { return _.first(v)[this.field] + ' core(s)'; } } }; - if (RESOURCE_MAP.hasOwnProperty(type)) { - var match = _.filter(resources, RESOURCE_MAP[type].filter); + if (!type) { + return _.chain(resources) + .groupBy(function (r) { + for (var key in RESOURCE_MAP) { + var field = RESOURCE_MAP[key].field; + if (r.hasOwnProperty(field) && r[field] !== null) { + return field; + } + } + return null; + }) + .size() + .value(); + } else if (RESOURCE_MAP.hasOwnProperty(type)) { + var field = RESOURCE_MAP[type].field; + var match = _.filter(resources, function (r) { return r[field] !== null; }); if (match && !_.isEmpty(match)) { return RESOURCE_MAP[type].format(match); } http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java b/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java index ddf8143..2e322d2 100644 --- a/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java +++ b/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java @@ -45,6 +45,7 @@ import org.junit.rules.ExpectedException; import static org.apache.aurora.gen.Resource.diskMb; import static org.apache.aurora.gen.Resource.namedPort; import static org.apache.aurora.gen.Resource.numCpus; +import static org.apache.aurora.gen.Resource.numGpus; import static org.apache.aurora.gen.Resource.ramMb; import static org.apache.aurora.gen.test.testConstants.INVALID_IDENTIFIERS; import static org.apache.aurora.gen.test.testConstants.VALID_IDENTIFIERS; @@ -113,17 +114,19 @@ public class ConfigurationManagerTest { private static final ConfigurationManager CONFIGURATION_MANAGER = new ConfigurationManager( new ConfigurationManagerSettings( - ALL_CONTAINER_TYPES, - false, - ImmutableMultimap.of(), - true), + ALL_CONTAINER_TYPES, + false, + ImmutableMultimap.of(), + true, + false), TaskTestUtil.DEV_TIER_MANAGER); private static final ConfigurationManager DOCKER_CONFIGURATION_MANAGER = new ConfigurationManager( new ConfigurationManagerSettings( - ALL_CONTAINER_TYPES, - true, - ImmutableMultimap.of("foo", "bar"), - false), + ALL_CONTAINER_TYPES, + true, + ImmutableMultimap.of("foo", "bar"), + false, + true), TaskTestUtil.DEV_TIER_MANAGER); @Test @@ -272,6 +275,22 @@ public class ConfigurationManagerTest { } @Test + public void testGpuResourcesNotAllowed() throws Exception { + TaskConfig builder = CONFIG_WITH_CONTAINER.newBuilder(); + builder.addToResources(numGpus(2)); + + expectTaskDescriptionException("GPU resource support is disabled in this cluster."); + new ConfigurationManager( + new ConfigurationManagerSettings( + ALL_CONTAINER_TYPES, + true, + ImmutableMultimap.of("foo", "bar"), + false, + false), + TaskTestUtil.DEV_TIER_MANAGER).validateAndPopulate(ITaskConfig.build(builder)); + } + + @Test public void testTaskLinks() throws Exception { TaskConfig builder = CONFIG_WITH_CONTAINER.newBuilder(); builder.addToResources(namedPort("health")); http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java b/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java index a5dda25..14ac547 100644 --- a/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java +++ b/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java @@ -32,6 +32,7 @@ import org.junit.Test; import static org.apache.aurora.gen.Resource.diskMb; import static org.apache.aurora.gen.Resource.namedPort; import static org.apache.aurora.gen.Resource.numCpus; +import static org.apache.aurora.gen.Resource.numGpus; import static org.apache.aurora.gen.Resource.ramMb; import static org.apache.aurora.scheduler.base.TaskTestUtil.JOB; import static org.apache.aurora.scheduler.base.TaskTestUtil.makeTask; @@ -114,6 +115,7 @@ public class ResourceManagerTest { public void testGetTaskResourceTypes() { AssignedTask builder = makeTask("id", JOB).newBuilder().getAssignedTask(); builder.getTask().addToResources(namedPort("health")); + builder.getTask().addToResources(numGpus(4)); assertEquals( EnumSet.allOf(ResourceType.class), http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java b/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java index 9cce641..a54d169 100644 --- a/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java +++ b/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java @@ -154,7 +154,8 @@ public class ThriftIT extends EasyMockTest { ImmutableSet.of(_Fields.DOCKER), true, ImmutableMultimap.of(), - false); + false, + true); createThrift(configurationManagerSettings); http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/python/apache/aurora/config/test_thrift.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/config/test_thrift.py b/src/test/python/apache/aurora/config/test_thrift.py index 8769db3..e213184 100644 --- a/src/test/python/apache/aurora/config/test_thrift.py +++ b/src/test/python/apache/aurora/config/test_thrift.py @@ -45,7 +45,7 @@ HELLO_WORLD = Job( task=Task( name='main', processes=[Process(name='hello_world', cmdline='echo {{mesos.instance}}')], - resources=Resources(cpu=0.1, ram=64 * 1048576, disk=64 * 1048576), + resources=Resources(cpu=0.1, ram=64 * 1048576, disk=64 * 1048576, gpu=2), ) ) @@ -77,6 +77,7 @@ def test_simple_config(): assert Resource(ramMb=64) in list(tti.resources) assert Resource(diskMb=64) in list(tti.resources) assert Resource(namedPort='health') in list(tti.resources) + assert Resource(numGpus=2) in list(tti.resources) def test_config_with_tier(): http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/python/apache/thermos/config/test_schema.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/thermos/config/test_schema.py b/src/test/python/apache/thermos/config/test_schema.py index 0440ee5..4cae2de 100644 --- a/src/test/python/apache/thermos/config/test_schema.py +++ b/src/test/python/apache/thermos/config/test_schema.py @@ -51,17 +51,28 @@ def test_order(): def test_add_resources(): - assert Units.resources_sum(Resources(), Resources()) == Resources(cpu=0, ram=0, disk=0) - - r100 = Resources(cpu=1, ram=0, disk=0) - r010 = Resources(cpu=0, ram=1, disk=0) - r001 = Resources(cpu=0, ram=0, disk=1) - r111 = Resources(cpu=1, ram=1, disk=1) - r222 = Resources(cpu=2, ram=2, disk=2) - - assert reduce(Units.resources_sum, [r100, r010, r001]) == r111 - assert Units.resources_sum(r111, r111) == r222 - assert r222 == Units.resources_sum(r100, r010, r001, r111, Resources()) + assert Units.resources_sum(Resources(), Resources()) == Resources(cpu=0, ram=0, disk=0, gpu=0) + + r1000 = Resources(cpu=1, ram=0, disk=0, gpu=0) + r1001 = Resources(cpu=1, ram=0, disk=0, gpu=1) + r0100 = Resources(cpu=0, ram=1, disk=0, gpu=0) + r0010 = Resources(cpu=0, ram=0, disk=1, gpu=0) + r1110 = Resources(cpu=1, ram=1, disk=1, gpu=0) + r1101 = Resources(cpu=1, ram=1, disk=0, gpu=1) + r2220 = Resources(cpu=2, ram=2, disk=2, gpu=0) + + assert reduce(Units.resources_sum, [r1000, r0100, r0010]) == r1110 + assert Units.resources_sum(r1110, r1110) == r2220 + assert r2220 == Units.resources_sum(r1000, r0100, r0010, r1110, Resources()) + assert Units.resources_sum(r1001, r0100) == r1101 + + +def test_max_resources(): + assert Resources(cpu=1, ram=2, disk=3, gpu=4) == Units.resources_max([ + Resources(cpu=0, ram=2, disk=1, gpu=4), + Resources(cpu=1, ram=1, disk=2, gpu=0), + Resources(cpu=0, ram=1, disk=3, gpu=1) + ]) def test_combine_tasks(): http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora index 219c40f..bf6ef69 100644 --- a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora +++ b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora @@ -14,7 +14,19 @@ import getpass -DEFAULT_CMD = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .' +class DefaultProfile(Struct): + role=Default(String, getpass.getuser()) + cmd=Default(String, 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .') + gpu=Default(Integer, 0) + + +ContainerProfile = DefaultProfile( + cmd = 'cp /tmp/http_example.py .' +) + +GpuProfile = DefaultProfile( + gpu=1 +) echo_ports = Process( name = 'echo_ports', @@ -27,12 +39,12 @@ run_server = Process( stage_server = Process( name = 'stage_server', - cmdline = '{{cmd}}' + cmdline = '{{profile.cmd}}' ) test_task = Task( name = 'http_example', - resources = Resources(cpu=0.4, ram=32*MB, disk=64*MB), + resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB, gpu='{{profile.gpu}}'), processes = [echo_ports, stage_server, run_server], constraints = order(echo_ports, stage_server, run_server)) @@ -45,9 +57,9 @@ job = Service( update_config = update_config, health_check_config = health_check_config, task = test_task, - role = getpass.getuser(), + role = '{{profile.role}}', environment = 'test', - contact = '{{role}}@localhost', + contact = '{{profile.role}}@localhost', announce = Announcer( portmap={'alias': 'http'}, ), @@ -56,25 +68,20 @@ job = Service( jobs = [ job( name = 'http_example' - ).bind( - cmd = DEFAULT_CMD - ), + ).bind(profile=DefaultProfile()), job( name = 'http_example_revocable', tier = 'revocable' - ).bind( - cmd = DEFAULT_CMD - ), + ).bind(profile=DefaultProfile()), job( name = 'http_example_docker', container = Container(docker=Docker(image = 'http_example')) - ).bind( - cmd = 'cp /tmp/http_example.py .' - ), + ).bind(profile=ContainerProfile), job( name = 'http_example_appc', container = Mesos(image=AppcImage(name='http_example', image_id='{{appc_image_id}}')) - ).bind( - cmd = 'cp /tmp/http_example.py .' - ) + ).bind(profile=ContainerProfile), + job( + name = 'http_example_gpu' + ).bind(profile=GpuProfile) ] http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora index 08553e4..edeafbe 100644 --- a/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora +++ b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora @@ -14,7 +14,19 @@ import getpass -DEFAULT_CMD = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .' +class DefaultProfile(Struct): + role=Default(String, getpass.getuser()) + cmd=Default(String, 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .') + gpu=Default(Integer, 0) + + +ContainerProfile = DefaultProfile( + cmd = 'cp /tmp/http_example.py .' +) + +GpuProfile = DefaultProfile( + gpu=1 +) run_server = Process( name = 'run_server', @@ -23,12 +35,12 @@ run_server = Process( stage_server = Process( name = 'stage_server', - cmdline = '{{cmd}}' + cmdline = '{{profile.cmd}}' ) test_task = Task( name = 'http_example', - resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB), + resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB, gpu='{{profile.gpu}}'), processes = [stage_server, run_server], constraints = order(stage_server, run_server) ) @@ -53,34 +65,29 @@ job = Service( update_config = update_config, health_check_config = health_check_config, task = test_task, - role = getpass.getuser(), + role = '{{profile.role}}', environment = 'test', - contact = '{{role}}@localhost', + contact = '{{profile.role}}@localhost', announce = Announcer(), ) jobs = [ job( name = 'http_example' - ).bind( - cmd = DEFAULT_CMD - ), + ).bind(profile=DefaultProfile()), job( name = 'http_example_revocable', tier = 'revocable' - ).bind( - cmd = DEFAULT_CMD - ), + ).bind(profile=DefaultProfile()), job( name = 'http_example_docker', container = Container(docker=Docker(image = 'http_example')) - ).bind( - cmd = 'cp /tmp/http_example.py .' - ), + ).bind(profile=ContainerProfile), job( name = 'http_example_appc', container = Mesos(image=AppcImage(name='http_example', image_id='{{appc_image_id}}')) - ).bind( - cmd = 'cp /tmp/http_example.py .' - ) + ).bind(profile=ContainerProfile), + job( + name = 'http_example_gpu' + ).bind(profile=GpuProfile) ] http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora index 8b3a50b..9569eec 100644 --- a/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora +++ b/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora @@ -14,7 +14,19 @@ import getpass -DEFAULT_CMD = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .' +class DefaultProfile(Struct): + role=Default(String, getpass.getuser()) + cmd=Default(String, 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .') + gpu=Default(Integer, 0) + + +ContainerProfile = DefaultProfile( + cmd = 'cp /tmp/http_example.py .' +) + +GpuProfile = DefaultProfile( + gpu=1 +) run_server = Process( name = 'run_server', @@ -22,12 +34,12 @@ run_server = Process( stage_server = Process( name = 'stage_server', - cmdline = '{{cmd}}' + cmdline = '{{profile.cmd}}' ) test_task = SequentialTask( name = 'http_example', - resources = Resources(cpu=0.4, ram=34*MB, disk=64*MB), + resources = Resources(cpu=0.4, ram=34*MB, disk=64*MB, gpu='{{profile.gpu}}'), processes = [stage_server, run_server]) update_config = UpdateConfig(watch_secs=10, batch_size=3) @@ -39,34 +51,29 @@ job = Service( update_config = update_config, health_check_config = health_check_config, task = test_task, - role = getpass.getuser(), + role = '{{profile.role}}', environment = 'test', - contact = '{{role}}@localhost', + contact = '{{profile.role}}@localhost', announce = Announcer(), ) jobs = [ job( name = 'http_example' - ).bind( - cmd = DEFAULT_CMD - ), + ).bind(profile=DefaultProfile()), job( name = 'http_example_revocable', tier = 'revocable' - ).bind( - cmd = DEFAULT_CMD - ), + ).bind(profile=DefaultProfile()), job( name = 'http_example_docker', container = Container(docker=Docker(image = 'http_example')) - ).bind( - cmd = 'cp /tmp/http_example.py .' - ), + ).bind(profile=ContainerProfile), job( name = 'http_example_appc', container = Mesos(image=AppcImage(name='http_example', image_id='{{appc_image_id}}')) - ).bind( - cmd = 'cp /tmp/http_example.py .' - ) + ).bind(profile=ContainerProfile), + job( + name = 'http_example_gpu' + ).bind(profile=GpuProfile) ] http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh index abe0ca7..c3c9e64 100755 --- a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh +++ b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh @@ -152,7 +152,7 @@ test_observer_ui() { test_restart() { local _jobkey=$1 - aurora job restart --batch-size=2 $_jobkey + aurora job restart --batch-size=2 --watch-secs=10 $_jobkey } assert_update_state() { @@ -343,7 +343,7 @@ test_http_example() { test_quota $_cluster $_role } -test_http_revocable_example() { +test_http_example_basic() { local _cluster=$1 _role=$2 _env=$3 local _base_config=$4 local _job=$7 @@ -429,6 +429,7 @@ TEST_ROLE=vagrant TEST_ENV=test TEST_JOB=http_example TEST_JOB_REVOCABLE=http_example_revocable +TEST_JOB_GPU=http_example_gpu TEST_JOB_DOCKER=http_example_docker TEST_JOB_APPC=http_example_appc TEST_CONFIG_FILE=$EXAMPLE_DIR/http_example.aurora @@ -450,6 +451,8 @@ TEST_JOB_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB") TEST_JOB_REVOCABLE_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_REVOCABLE") +TEST_JOB_GPU_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_GPU") + TEST_JOB_DOCKER_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_DOCKER") TEST_ADMIN_ARGS=($TEST_CLUSTER) @@ -469,7 +472,9 @@ test_version test_http_example "${TEST_JOB_ARGS[@]}" test_health_check -test_http_revocable_example "${TEST_JOB_REVOCABLE_ARGS[@]}" +test_http_example_basic "${TEST_JOB_REVOCABLE_ARGS[@]}" + +test_http_example_basic "${TEST_JOB_GPU_ARGS[@]}" # build the test docker image sudo docker build -t http_example ${TEST_ROOT}
