This is an automated email from the ASF dual-hosted git repository.
zhouquan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/submarine.git
The following commit(s) were added to refs/heads/master by this push:
new 6e19ae9 SUBMARINE-202. submarine core need to support MXNet
6e19ae9 is described below
commit 6e19ae95d0977a6fdf5708a94c5f113f67aff25c
Author: Ryan Lo <[email protected]>
AuthorDate: Thu Feb 13 15:22:45 2020 +0800
SUBMARINE-202. submarine core need to support MXNet
### What is this PR for?
To support the MXNet framework in Submarine.
### What type of PR is it?
[Improvement]
### Todos
* [ ] - Task
### What is the Jira issue?
[SUBMARINE-202](https://issues.apache.org/jira/projects/SUBMARINE/issues/SUBMARINE-202)
### How should this be tested?
[passed CI](https://travis-ci.org/lowc1012/submarine/builds/646785076)
### Screenshots (if appropriate)
<img width="1212" alt="screenshot1"
src="https://user-images.githubusercontent.com/52355146/73934141-d207f100-4918-11ea-86a4-920f084ff76e.png">
<img width="1214" alt="screenshot2"
src="https://user-images.githubusercontent.com/52355146/73934180-e8ae4800-4918-11ea-8c8e-6baa2c29f2af.png">
### Questions:
* Do the license files need an update? No
* Are there breaking changes for older versions? No
* Does this need documentation? No
Author: Ryan Lo <[email protected]>
Closes #174 from lowc1012/SUBMARINE-202 and squashes the following commits:
6af5f18 [Ryan Lo] SUBMARINE-202. submarine core need to support MXNet
---
.../apache/submarine/client/cli/CliConstants.java | 4 +
.../client/cli/param/ParametersHolder.java | 42 +++-
.../cli/param/runjob/MXNetRunJobParameters.java | 236 +++++++++++++++++++++
.../cli/param/runjob/PyTorchRunJobParameters.java | 14 ++
.../param/runjob/TensorFlowRunJobParameters.java | 27 +++
.../submarine/client/cli/param/yaml/Roles.java | 9 +
.../submarine/client/cli/runjob/RunJobCli.java | 44 +++-
.../runjob/mxnet/RunJobCliParsingMXNetTest.java | 175 +++++++++++++++
.../RunJobCliParsingMXNetYamlTest.java} | 144 +++++++------
.../pytorch/RunJobCliParsingPyTorchTest.java | 71 +++++++
.../pytorch/RunJobCliParsingPyTorchYamlTest.java | 14 +-
.../tensorflow/RunJobCliParsingTensorFlowTest.java | 67 ++++++
.../RunJobCliParsingTensorFlowYamlTest.java | 13 ++
.../runjob-mxnet-yaml/envs-are-missing.yaml | 61 ++++++
.../invalid-config-tensorboard-section.yaml | 67 ++++++
.../security-principal-is-missing.yaml | 63 ++++++
.../valid-config-with-overrides.yaml | 91 ++++++++
.../resources/runjob-mxnet-yaml/valid-config.yaml | 64 ++++++
.../runjob-mxnet-yaml/valid-gpu-config.yaml | 64 ++++++
.../invalid-config-scheduler-section.yaml | 56 +++++
.../invalid-config-scheduler-section.yaml | 61 ++++++
.../submarine/commons/runtime/Framework.java | 3 +-
.../submarine/commons/runtime/api/MXNetRole.java | 30 +--
.../submarine/server/submitter/yarn/YarnUtils.java | 175 +++++++++------
.../src/test/java/YarnUtilsTest.java | 59 ++++++
25 files changed, 1496 insertions(+), 158 deletions(-)
diff --git
a/submarine-client/src/main/java/org/apache/submarine/client/cli/CliConstants.java
b/submarine-client/src/main/java/org/apache/submarine/client/cli/CliConstants.java
index d99d56e..e89ad1b 100644
---
a/submarine-client/src/main/java/org/apache/submarine/client/cli/CliConstants.java
+++
b/submarine-client/src/main/java/org/apache/submarine/client/cli/CliConstants.java
@@ -37,6 +37,8 @@ public class CliConstants {
public static final String WORKER_RES = "worker_resources";
public static final String SERVING_RES = "serving_resources";
public static final String PS_RES = "ps_resources";
+ public static final String SCHEDULER_RES = "scheduler_resources";
+ public static final String N_SCHEDULERS = "num_schedulers";
public static final String DOCKER_IMAGE = "docker_image";
public static final String QUEUE = "queue";
public static final String TENSORBOARD = "tensorboard";
@@ -48,6 +50,7 @@ public class CliConstants {
public static final String WORKER_LAUNCH_CMD = "worker_launch_cmd";
public static final String SERVING_LAUNCH_CMD = "serving_launch_cmd";
public static final String PS_LAUNCH_CMD = "ps_launch_cmd";
+ public static final String SCHEDULER_LAUNCH_CMD = "scheduler_launch_cmd";
public static final String ENV = "env";
public static final String VERBOSE = "verbose";
public static final String SERVING_FRAMEWORK = "serving_framework";
@@ -55,6 +58,7 @@ public class CliConstants {
public static final String WAIT_JOB_FINISH = "wait_job_finish";
public static final String PS_DOCKER_IMAGE = "ps_docker_image";
public static final String WORKER_DOCKER_IMAGE = "worker_docker_image";
+ public static final String SCHEDULER_DOCKER_IMAGE = "scheduler_docker_image";
public static final String QUICKLINK = "quicklink";
public static final String TENSORBOARD_DOCKER_IMAGE =
"tensorboard_docker_image";
diff --git
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/ParametersHolder.java
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/ParametersHolder.java
index 44891f7..556dfe6 100644
---
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/ParametersHolder.java
+++
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/ParametersHolder.java
@@ -27,6 +27,7 @@ import org.apache.commons.cli.ParseException;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.submarine.client.cli.CliConstants;
import org.apache.submarine.client.cli.Command;
+import org.apache.submarine.client.cli.param.runjob.MXNetRunJobParameters;
import org.apache.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
import org.apache.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
import org.apache.submarine.client.cli.param.yaml.Configs;
@@ -69,7 +70,7 @@ public final class ParametersHolder implements Parameter {
LoggerFactory.getLogger(ParametersHolder.class);
public static final String SUPPORTED_FRAMEWORKS_MESSAGE =
- "TensorFlow and PyTorch are the only supported frameworks for now!";
+ "TensorFlow, PyTorch, MXNet are the only supported frameworks for now!";
public static final String SUPPORTED_COMMANDS_MESSAGE =
"'Show job' and 'run job' are the only supported commands for now!";
@@ -108,6 +109,8 @@ public final class ParametersHolder implements Parameter {
return new TensorFlowRunJobParameters();
} else if (framework == Framework.PYTORCH) {
return new PyTorchRunJobParameters();
+ } else if (framework == Framework.MXNET) {
+ return new MXNetRunJobParameters();
} else {
throw new UnsupportedOperationException(SUPPORTED_FRAMEWORKS_MESSAGE);
}
@@ -126,11 +129,18 @@ public final class ParametersHolder implements Parameter {
"is the selected framework!");
}
- if (isCommandRunJob() && isFrameworkPyTorch() &&
- isTensorboardSectionDefined(yamlConfig)) {
+ if (isCommandRunJob() && (isFrameworkPyTorch() || isFrameworkMXNet())
+ && isTensorboardSectionDefined(yamlConfig)) {
throw new YamlParseException(
- "TensorBoard section should not be defined when PyTorch " +
- "is the selected framework!");
+ "TensorBoard section should not be defined when TensorFlow " +
+ "is not the selected framework!");
+ }
+
+ if (isCommandRunJob() && !isFrameworkMXNet() &&
+ isSchedulerSectionDefined(yamlConfig)) {
+ throw new YamlParseException(
+ "Scheduler section should not be defined when MXNet " +
+ "is not the selected framework!");
}
}
@@ -142,6 +152,10 @@ public final class ParametersHolder implements Parameter {
return framework == Framework.PYTORCH;
}
+ private boolean isFrameworkMXNet() {
+ return framework == Framework.MXNET;
+ }
+
private boolean isPsSectionDefined(YamlConfigFile yamlConfig) {
return yamlConfig != null &&
yamlConfig.getRoles() != null &&
@@ -153,6 +167,12 @@ public final class ParametersHolder implements Parameter {
yamlConfig.getTensorBoard() != null;
}
+ private boolean isSchedulerSectionDefined(YamlConfigFile yamlConfig) {
+ return yamlConfig != null &&
+ yamlConfig.getRoles() != null &&
+ yamlConfig.getRoles().getScheduler() != null;
+ }
+
private Framework determineFrameworkType()
throws ParseException, YarnException {
if (!isCommandRunJob()) {
@@ -195,6 +215,7 @@ public final class ParametersHolder implements Parameter {
initGenericConfigs(yamlConfig, yamlConfigValues);
initPs(yamlConfigValues, roles.getPs());
initWorker(yamlConfigValues, roles.getWorker());
+ initScheduler(yamlConfigValues, roles.getScheduler());
initScheduling(yamlConfigValues, yamlConfig.getScheduling());
initSecurity(yamlConfigValues, yamlConfig.getSecurity());
initTensorBoard(yamlConfigValues, yamlConfig.getTensorBoard());
@@ -253,6 +274,17 @@ public final class ParametersHolder implements Parameter {
yamlConfigs.put(CliConstants.WORKER_LAUNCH_CMD, worker.getLaunchCmd());
}
+ private void initScheduler(Map<String, String> yamlConfigs, Role scheduler) {
+ if (scheduler == null) {
+ return;
+ }
+ yamlConfigs.put(CliConstants.N_SCHEDULERS,
+ String.valueOf(scheduler.getReplicas()));
+ yamlConfigs.put(CliConstants.SCHEDULER_RES, scheduler.getResources());
+ yamlConfigs.put(CliConstants.SCHEDULER_DOCKER_IMAGE,
scheduler.getDockerImage());
+ yamlConfigs.put(CliConstants.SCHEDULER_LAUNCH_CMD,
scheduler.getLaunchCmd());
+ }
+
private void initScheduling(Map<String, String> yamlConfigValues,
Scheduling scheduling) {
if (scheduling == null) {
diff --git
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/MXNetRunJobParameters.java
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/MXNetRunJobParameters.java
new file mode 100644
index 0000000..4c4671b
--- /dev/null
+++
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/MXNetRunJobParameters.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.submarine.client.cli.param.runjob;
+
+import com.google.common.collect.Lists;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.submarine.client.cli.CliConstants;
+import org.apache.submarine.client.cli.CliUtils;
+import org.apache.submarine.client.cli.runjob.RoleParameters;
+import org.apache.submarine.commons.runtime.ClientContext;
+import org.apache.submarine.commons.runtime.api.MXNetRole;
+import org.apache.submarine.commons.runtime.param.Parameter;
+import org.apache.submarine.commons.runtime.resource.ResourceUtils;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Parameters for MXNet job.
+ */
+public class MXNetRunJobParameters extends RunJobParameters {
+ private RoleParameters psParameters =
+ RoleParameters.createEmpty(MXNetRole.PS);
+
+ private RoleParameters schedulerParameters =
+ RoleParameters.createEmpty(MXNetRole.SCHEDULER);
+
+ private static final String CANNOT_BE_DEFINED_FOR_MXNET =
+ "cannot be defined for MXNet jobs!";
+
+ @Override
+ public void updateParameters(Parameter parametersHolder, ClientContext
clientContext)
+ throws ParseException, IOException, YarnException {
+
+ checkArguments(parametersHolder);
+ super.updateParameters(parametersHolder, clientContext);
+
+ String input =
parametersHolder.getOptionValue(CliConstants.INPUT_PATH);
+ this.workerParameters = generateWorkerParameters(clientContext,
parametersHolder, input);
+ this.psParameters = getPSParameters(parametersHolder);
+ this.schedulerParameters = getSchedulerParameters(parametersHolder);
+ this.distributed =
determineIfDistributed(workerParameters.getReplicas(),
+ psParameters.getReplicas(), schedulerParameters.getReplicas());
+
+ executePostOperations(clientContext);
+ }
+
+ @Override
+ void executePostOperations(ClientContext clientContext) throws IOException
{
+ // Set default job dir / saved model dir, etc.
+ setDefaultDirs(clientContext);
+ replacePatternsInParameters(clientContext);
+ }
+
+ @Override
+ public List<String> getLaunchCommands() {
+ return Lists.newArrayList(getWorkerLaunchCmd(), getPSLaunchCmd(),
getSchedulerLaunchCmd());
+ }
+
+ private void replacePatternsInParameters(ClientContext clientContext)
+ throws IOException {
+ if (StringUtils.isNotEmpty(getPSLaunchCmd())) {
+ String afterReplace =
+ CliUtils.replacePatternsInLaunchCommand(getPSLaunchCmd(),
this,
+ clientContext.getRemoteDirectoryManager());
+ setPSLaunchCmd(afterReplace);
+ }
+ if (StringUtils.isNotEmpty(getWorkerLaunchCmd())) {
+ String afterReplace =
+
CliUtils.replacePatternsInLaunchCommand(getWorkerLaunchCmd(), this,
+ clientContext.getRemoteDirectoryManager());
+ setWorkerLaunchCmd(afterReplace);
+ }
+ if (StringUtils.isNotEmpty(getSchedulerLaunchCmd())) {
+ String afterReplace =
+
CliUtils.replacePatternsInLaunchCommand(getSchedulerLaunchCmd(), this,
+ clientContext.getRemoteDirectoryManager());
+ setSchedulerLaunchCmd(afterReplace);
+ }
+ }
+
+ private void checkArguments(Parameter parametersHolder)
+ throws YarnException, ParseException {
+ if (parametersHolder.hasOption(CliConstants.TENSORBOARD)) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.TENSORBOARD));
+ } else if (parametersHolder
+ .getOptionValue(CliConstants.TENSORBOARD_RESOURCES) != null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.TENSORBOARD_RESOURCES));
+ } else if (parametersHolder
+ .getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE) !=
null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.TENSORBOARD_DOCKER_IMAGE));
+ }
+ }
+
+ private boolean determineIfDistributed(int nWorkers, int nPS, int
nSchedulers) {
+ return nWorkers >= 2 && nPS > 0 && nSchedulers == 1;
+ }
+
+ private String getParamCannotBeDefinedErrorMessage(String cliName) {
+ return String.format(
+ "Parameter '%s' " + CANNOT_BE_DEFINED_FOR_MXNET, cliName);
+ }
+
+ private RoleParameters getPSParameters(Parameter parametersHolder)
+ throws YarnException, ParseException {
+ int nPS = getNumberOfPS(parametersHolder);
+ Resource psResource =
+ determinePSResource(parametersHolder, nPS);
+ String psDockerImage =
+ parametersHolder.getOptionValue(CliConstants.PS_DOCKER_IMAGE);
+ String psLaunchCommand =
+ parametersHolder.getOptionValue(CliConstants.PS_LAUNCH_CMD);
+ return new RoleParameters(MXNetRole.PS, nPS, psLaunchCommand,
+ psDockerImage, psResource);
+ }
+
+ private Resource determinePSResource(Parameter parametersHolder, int nPS)
+ throws ParseException, YarnException {
+ if (nPS > 0) {
+ String psResourceStr =
+ parametersHolder.getOptionValue(CliConstants.PS_RES);
+ if (psResourceStr == null) {
+ throw new ParseException("--" + CliConstants.PS_RES + " is
absent.");
+ }
+ return ResourceUtils.createResourceFromString(psResourceStr);
+ }
+ return null;
+ }
+
+ public String getPSLaunchCmd() {
+ return psParameters.getLaunchCommand();
+ }
+
+ public void setPSLaunchCmd(String launchCmd) {
+ psParameters.setLaunchCommand(launchCmd);
+ }
+
+ private int getNumberOfPS(Parameter parametersHolder) throws YarnException
{
+ int nPS = 0;
+ if (parametersHolder.getOptionValue(CliConstants.N_PS) != null) {
+ nPS =
Integer.parseInt(parametersHolder.getOptionValue(CliConstants.N_PS));
+ }
+ return nPS;
+ }
+
+ private RoleParameters getSchedulerParameters(Parameter parametersHolder)
+ throws YarnException, ParseException {
+ int nSchedulers = getNumberOfScheduler(parametersHolder);
+ Resource schedulerResource =
+ determineSchedulerResource(parametersHolder, nSchedulers);
+ String schedulerDockerImage =
+
parametersHolder.getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE);
+ String schedulerLaunchCommand =
+
parametersHolder.getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD);
+ return new RoleParameters(MXNetRole.SCHEDULER, nSchedulers,
schedulerLaunchCommand,
+ schedulerDockerImage, schedulerResource);
+ }
+
+ private Resource determineSchedulerResource(Parameter parametersHolder,
int nSchedulers)
+ throws ParseException, YarnException {
+ if (nSchedulers > 0) {
+ String schedulerResourceStr =
parametersHolder.getOptionValue(CliConstants.SCHEDULER_RES);
+ if (schedulerResourceStr == null) {
+ throw new ParseException("--" + CliConstants.SCHEDULER_RES + "
is absent.");
+ }
+ return
ResourceUtils.createResourceFromString(schedulerResourceStr);
+ }
+ return null;
+ }
+
+ private int getNumberOfScheduler(Parameter parametersHolder) throws
ParseException, YarnException {
+ int nSchedulers = 0;
+ if (parametersHolder.getOptionValue(CliConstants.N_SCHEDULERS) !=
null) {
+ nSchedulers =
Integer.parseInt(parametersHolder.getOptionValue(CliConstants.N_SCHEDULERS));
+ if (nSchedulers > 1 || nSchedulers < 0) {
+ throw new ParseException("--" + CliConstants.N_SCHEDULERS + "
should be 1 or 0");
+ }
+ }
+ return nSchedulers;
+ }
+
+ public String getSchedulerLaunchCmd() {
+ return schedulerParameters.getLaunchCommand();
+ }
+
+ public void setSchedulerLaunchCmd(String launchCmd) {
+ schedulerParameters.setLaunchCommand(launchCmd);
+ }
+
+ public int getNumPS() {
+ return psParameters.getReplicas();
+ }
+
+ public Resource getPsResource() {
+ return psParameters.getResource();
+ }
+
+ public String getPsDockerImage() {
+ return psParameters.getDockerImage();
+ }
+
+ public int getNumSchedulers() {
+ return schedulerParameters.getReplicas();
+ }
+
+ public Resource getSchedulerResource() {
+ return schedulerParameters.getResource();
+ }
+
+ public String getSchedulerDockerImage() {
+ return schedulerParameters.getDockerImage();
+ }
+}
diff --git
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/PyTorchRunJobParameters.java
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/PyTorchRunJobParameters.java
index 80767cf..ad0b64f 100644
---
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/PyTorchRunJobParameters.java
+++
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/PyTorchRunJobParameters.java
@@ -81,6 +81,20 @@ public class PyTorchRunJobParameters extends
RunJobParameters {
.getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE) != null) {
throw new ParseException(getParamCannotBeDefinedErrorMessage(
CliConstants.TENSORBOARD_DOCKER_IMAGE));
+ } else if (parametersHolder.getOptionValue(CliConstants.N_SCHEDULERS) !=
null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.N_SCHEDULERS));
+ } else if (parametersHolder.getOptionValue(CliConstants.SCHEDULER_RES) !=
null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.SCHEDULER_RES));
+ } else if (parametersHolder
+ .getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE) != null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.SCHEDULER_DOCKER_IMAGE));
+ } else if (parametersHolder
+ .getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD) != null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.SCHEDULER_LAUNCH_CMD));
}
}
diff --git
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java
index 06f2bc0..925c3f3 100644
---
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java
+++
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java
@@ -40,6 +40,8 @@ import java.util.List;
*/
public class TensorFlowRunJobParameters extends RunJobParameters {
private boolean tensorboardEnabled;
+ private static final String CANNOT_BE_DEFINED_FOR_TF =
+ "cannot be defined for TensorFlow jobs!";
private RoleParameters psParameters =
RoleParameters.createEmpty(TensorFlowRole.PS);
private RoleParameters tensorBoardParameters =
@@ -48,6 +50,7 @@ public class TensorFlowRunJobParameters extends
RunJobParameters {
@Override
public void updateParameters(Parameter parametersHolder, ClientContext
clientContext)
throws ParseException, IOException, YarnException {
+ checkArguments(parametersHolder);
super.updateParameters(parametersHolder, clientContext);
String input = parametersHolder.getOptionValue(CliConstants.INPUT_PATH);
@@ -72,6 +75,30 @@ public class TensorFlowRunJobParameters extends
RunJobParameters {
replacePatternsInParameters(clientContext);
}
+ private void checkArguments(Parameter parametersHolder)
+ throws YarnException, ParseException {
+ if (parametersHolder.getOptionValue(CliConstants.N_SCHEDULERS) != null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.N_SCHEDULERS));
+ } else if (parametersHolder.getOptionValue(CliConstants.SCHEDULER_RES) !=
null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.SCHEDULER_RES));
+ } else if (parametersHolder
+ .getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE) != null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.SCHEDULER_DOCKER_IMAGE));
+ } else if (parametersHolder
+ .getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD) != null) {
+ throw new ParseException(getParamCannotBeDefinedErrorMessage(
+ CliConstants.SCHEDULER_LAUNCH_CMD));
+ }
+ }
+
+ private String getParamCannotBeDefinedErrorMessage(String cliName) {
+ return String.format(
+ "Parameter '%s' " + CANNOT_BE_DEFINED_FOR_TF, cliName);
+ }
+
private void replacePatternsInParameters(ClientContext clientContext)
throws IOException {
if (StringUtils.isNotEmpty(getPSLaunchCmd())) {
diff --git
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
index fab6f31..810c36f 100644
---
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
+++
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
@@ -25,6 +25,7 @@ package org.apache.submarine.client.cli.param.yaml;
public class Roles {
private Role worker;
private Role ps;
+ private Role scheduler;
public Role getWorker() {
return worker;
@@ -41,4 +42,12 @@ public class Roles {
public void setPs(Role ps) {
this.ps = ps;
}
+
+ public Role getScheduler() {
+ return scheduler;
+ }
+
+ public void setScheduler(Role scheduler) {
+ this.scheduler = scheduler;
+ }
}
diff --git
a/submarine-client/src/main/java/org/apache/submarine/client/cli/runjob/RunJobCli.java
b/submarine-client/src/main/java/org/apache/submarine/client/cli/runjob/RunJobCli.java
index 69426d5..c317d3f 100644
---
a/submarine-client/src/main/java/org/apache/submarine/client/cli/runjob/RunJobCli.java
+++
b/submarine-client/src/main/java/org/apache/submarine/client/cli/runjob/RunJobCli.java
@@ -60,10 +60,14 @@ import java.util.Map;
public class RunJobCli extends AbstractCli {
private static final Logger LOG =
LoggerFactory.getLogger(RunJobCli.class);
- private static final String CAN_BE_USED_WITH_TF_PYTORCH =
- "Can be used with TensorFlow or PyTorch frameworks.";
+ private static final String CAN_BE_USED_WITH_TF_PYTORCH_MXNET =
+ "Can be used with TensorFlow or PyTorch or MXNet frameworks.";
+ private static final String CAN_BE_USED_WITH_TF_MXNET =
+ "Can be used with TensorFlow or MXNet frameworks.";
private static final String CAN_BE_USED_WITH_TF_ONLY =
"Can only be used with TensorFlow framework.";
+ private static final String CAN_BE_USED_WITH_MXNET_ONLY =
+ "Can only be used with MXNet framework.";
public static final String YAML_PARSE_FAILED = "Failed to parse " +
"YAML config";
@@ -117,6 +121,7 @@ public class RunJobCli extends AbstractCli {
addWorkerOptions(options);
addPSOptions(options);
+ addSchedulerOptions(options);
addTensorboardOptions(options);
options.addOption(CliConstants.ENV, true,
@@ -170,37 +175,37 @@ public class RunJobCli extends AbstractCli {
private void addWorkerOptions(Options options) {
options.addOption(CliConstants.N_WORKERS, true,
"Number of worker tasks of the job, by default it's 1." +
- CAN_BE_USED_WITH_TF_PYTORCH);
+ CAN_BE_USED_WITH_TF_PYTORCH_MXNET);
options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
"Specify docker image for WORKER, when this is not specified, WORKER "
+ "uses --" + CliConstants.DOCKER_IMAGE + " as default." +
- CAN_BE_USED_WITH_TF_PYTORCH);
+ CAN_BE_USED_WITH_TF_PYTORCH_MXNET);
options.addOption(CliConstants.WORKER_LAUNCH_CMD, true,
"Commandline of worker, arguments will be "
+ "directly used to launch the worker" +
- CAN_BE_USED_WITH_TF_PYTORCH);
+ CAN_BE_USED_WITH_TF_PYTORCH_MXNET);
options.addOption(CliConstants.WORKER_RES, true,
"Resource of each worker, for example "
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
- CAN_BE_USED_WITH_TF_PYTORCH);
+ CAN_BE_USED_WITH_TF_PYTORCH_MXNET);
}
private void addPSOptions(Options options) {
options.addOption(CliConstants.N_PS, true,
"Number of PS tasks of the job, by default it's 0. " +
- CAN_BE_USED_WITH_TF_ONLY);
+ CAN_BE_USED_WITH_TF_MXNET);
options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
"Specify docker image for PS, when this is not specified, PS uses --"
+ CliConstants.DOCKER_IMAGE + " as default." +
- CAN_BE_USED_WITH_TF_ONLY);
+ CAN_BE_USED_WITH_TF_MXNET);
options.addOption(CliConstants.PS_LAUNCH_CMD, true,
- "Commandline of worker, arguments will be "
+ "Commandline of PS, arguments will be "
+ "directly used to launch the PS" +
- CAN_BE_USED_WITH_TF_ONLY);
+ CAN_BE_USED_WITH_TF_MXNET);
options.addOption(CliConstants.PS_RES, true,
"Resource of each PS, for example "
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
- CAN_BE_USED_WITH_TF_ONLY);
+ CAN_BE_USED_WITH_TF_MXNET);
}
private void addTensorboardOptions(Options options) {
@@ -219,6 +224,23 @@ public class RunJobCli extends AbstractCli {
CAN_BE_USED_WITH_TF_ONLY);
}
+ private void addSchedulerOptions(Options options) {
+ options.addOption(CliConstants.N_SCHEDULERS, true,
+ "Number of scheduler tasks of the job. " +
+ "It should be 1 or 0, by default it's 0."+
+ CAN_BE_USED_WITH_MXNET_ONLY);
+ options.addOption(CliConstants.SCHEDULER_DOCKER_IMAGE, true,
+ "Specify docker image for scheduler, when this is not specified, " +
+ "scheduler uses --" + CliConstants.DOCKER_IMAGE +
+ " as default. " + CAN_BE_USED_WITH_MXNET_ONLY);
+ options.addOption(CliConstants.SCHEDULER_LAUNCH_CMD, true,
+ "Commandline of scheduler, arguments will be " +
+ "directly used to launch the scheduler. " +
CAN_BE_USED_WITH_MXNET_ONLY);
+ options.addOption(CliConstants.SCHEDULER_RES, true,
+ "Resource of each scheduler, for example " +
+ "memory-mb=2048,vcores=2. " + CAN_BE_USED_WITH_MXNET_ONLY);
+ }
+
private void parseCommandLineAndGetRunJobParameters(String[] args)
throws ParseException, IOException, YarnException {
try {
diff --git
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetTest.java
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetTest.java
new file mode 100644
index 0000000..c6c0678
--- /dev/null
+++
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetTest.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.submarine.client.cli.runjob.mxnet;
+
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.yarn.util.resource.Resources;
+import org.apache.submarine.client.cli.param.runjob.MXNetRunJobParameters;
+import org.apache.submarine.client.cli.param.runjob.RunJobParameters;
+import org.apache.submarine.client.cli.runjob.RunJobCli;
+import org.apache.submarine.client.cli.runjob.RunJobCliParsingCommonTest;
+import org.apache.submarine.commons.runtime.conf.SubmarineLogs;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import static org.junit.Assert.*;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test class that verifies the correctness of MXNet
+ * CLI configuration parsing.
+ */
+public class RunJobCliParsingMXNetTest {
+ @Before
+ public void before() {
+ SubmarineLogs.verboseOff();
+ }
+
+ @Rule
+ public ExpectedException expectedException = ExpectedException.none();
+
+ @Test
+ public void testBasicRunJobForSingleNodeTraining() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ runJobCli.run(
+ new String[]{"--framework", "mxnet", "--name", "my-job",
+ "--docker_image", "mxnet-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--num_workers", "1", "--num_ps", "1", "--worker_launch_cmd",
+ "python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
+ "--ps_resources", "memory=4G,vcores=2", "--ps_launch_cmd",
+ "python run-ps.py", "--num_schedulers", "1",
"--scheduler_launch_cmd",
+ "python run-scheduler.py", "--scheduler_resources",
"memory=1024M,vcores=2",
+ "--verbose", "--wait_job_finish"});
+
+ RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
+ assertTrue(RunJobParameters.class +
+ " must be an instance of " +
+ MXNetRunJobParameters.class,
+ jobRunParameters instanceof MXNetRunJobParameters);
+ MXNetRunJobParameters mxNetParams =
+ (MXNetRunJobParameters) jobRunParameters;
+
+ assertEquals(jobRunParameters.getInputPath(), "hdfs://input");
+ assertEquals(mxNetParams.getNumWorkers(), 1);
+ assertEquals(mxNetParams.getWorkerLaunchCmd(), "python run-job.py");
+ assertEquals(Resources.createResource(2048, 2),
+ mxNetParams.getWorkerResource());
+ assertEquals(mxNetParams.getNumPS(), 1);
+ assertEquals(mxNetParams.getNumSchedulers(), 1);
+ assertTrue(SubmarineLogs.isVerbose());
+ assertTrue(jobRunParameters.isWaitJobFinish());
+ }
+
+ @Test
+ public void testBasicRunJobForDistributedTraining() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+ runJobCli.run(
+ new String[]{"--framework", "mxnet", "--name", "my-job",
+ "--docker_image", "mxnet-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--num_workers", "2", "--num_ps", "2", "--worker_launch_cmd",
+ "python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
+ "--ps_resources", "memory=4G,vcores=2", "--ps_launch_cmd",
+ "python run-ps.py", "--num_schedulers", "1",
"--scheduler_launch_cmd",
+ "python run-scheduler.py", "--scheduler_resources",
"memory=1024M,vcores=2",
+ "--verbose"});
+ RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
+ assertTrue(RunJobParameters.class +
+ " must be an instance of " +
+ MXNetRunJobParameters.class,
+ jobRunParameters instanceof MXNetRunJobParameters);
+ MXNetRunJobParameters mxNetParams =
+ (MXNetRunJobParameters) jobRunParameters;
+
+ assertEquals(jobRunParameters.getInputPath(), "hdfs://input");
+ assertEquals(jobRunParameters.getDockerImageName(), "mxnet-docker:1.1.0");
+ assertEquals(mxNetParams.getNumWorkers(), 2);
+ assertEquals(Resources.createResource(2048, 2),
+ mxNetParams.getWorkerResource());
+ assertEquals(mxNetParams.getWorkerLaunchCmd(), "python run-job.py");
+ assertEquals(mxNetParams.getNumPS(), 2);
+ assertEquals(Resources.createResource(4096, 2),
+ mxNetParams.getPsResource());
+ assertEquals(mxNetParams.getPSLaunchCmd(), "python run-ps.py");
+ assertEquals(mxNetParams.getNumSchedulers(), 1);
+ assertEquals(Resources.createResource(1024, 2),
+ mxNetParams.getSchedulerResource());
+ assertEquals(mxNetParams.getSchedulerLaunchCmd(), "python
run-scheduler.py");
+ assertTrue(SubmarineLogs.isVerbose());
+ }
+
+ @Test
+ public void testTensorboardCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for MXNet jobs");
+ runJobCli.run(
+ new String[]{"--framework", "mxnet",
+ "--name", "my-job", "--docker_image", "mxnet-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--num_workers", "2",
+ "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=2048M,vcores=2",
+ "--tensorboard"});
+ }
+
+ @Test
+ public void testTensorboardResourcesCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for MXNet jobs");
+ runJobCli.run(
+ new String[]{"--framework", "mxnet",
+ "--name", "my-job", "--docker_image", "mxnet-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--num_workers", "2",
+ "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=2048M,vcores=2",
+ "--tensorboard_resources", "memory=1024M,vcores=2"});
+ }
+
+ @Test
+ public void testTensorboardDockerImageCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for MXNet jobs");
+ runJobCli.run(
+ new String[]{"--framework", "mxnet",
+ "--name", "my-job", "--docker_image", "mxnet-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--num_workers", "2",
+ "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=2048M,vcores=2",
+ "--tensorboard_docker_image", "TBDockerImage"});
+ }
+}
+
diff --git
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetYamlTest.java
similarity index 71%
copy from
submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
copy to
submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetYamlTest.java
index 6c9c5b8..0aa60f6 100644
---
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
+++
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetYamlTest.java
@@ -17,48 +17,41 @@
* under the License.
*/
-package org.apache.submarine.client.cli.runjob.pytorch;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.util.List;
+package org.apache.submarine.client.cli.runjob.mxnet;
+import com.google.common.collect.ImmutableList;
import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.util.resource.Resources;
import org.apache.submarine.client.cli.YamlConfigTestUtils;
-import org.apache.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
+import org.apache.submarine.client.cli.param.runjob.MXNetRunJobParameters;
import org.apache.submarine.client.cli.param.runjob.RunJobParameters;
import org.apache.submarine.client.cli.param.yaml.YamlParseException;
import org.apache.submarine.client.cli.runjob.RunJobCli;
+import org.apache.submarine.client.cli.runjob.RunJobCliParsingCommonTest;
import org.apache.submarine.commons.runtime.conf.SubmarineLogs;
import
org.apache.submarine.commons.runtime.exception.SubmarineRuntimeException;
import org.apache.submarine.commons.runtime.resource.ResourceUtils;
-import org.apache.hadoop.yarn.util.resource.Resources;
-import org.apache.submarine.client.cli.runjob.RunJobCliParsingCommonTest;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
+import org.junit.*;
import org.junit.rules.ExpectedException;
-
-import com.google.common.collect.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.File;
+import java.util.List;
+
+import static org.junit.Assert.*;
+import static org.junit.Assert.assertNotNull;
+
/**
- * Test class that verifies the correctness of PyTorch
+ * Test class that verifies the correctness of MXNet
* YAML configuration parsing.
*/
-public class RunJobCliParsingPyTorchYamlTest {
+public class RunJobCliParsingMXNetYamlTest {
private static final String OVERRIDDEN_PREFIX = "overridden_";
- private static final String DIR_NAME = "runjob-pytorch-yaml";
+ private static final String DIR_NAME = "runjob-mxnet-yaml";
private File yamlConfig;
private static Logger LOG = LoggerFactory.getLogger(
- RunJobCliParsingPyTorchYamlTest.class);
+ RunJobCliParsingMXNetYamlTest.class);
@Before
public void before() {
@@ -82,7 +75,7 @@ public class RunJobCliParsingPyTorchYamlTest {
List<String> expectedEnvs) {
assertEquals("testInputPath", jobRunParameters.getInputPath());
assertEquals("testCheckpointPath", jobRunParameters.getCheckpointPath());
- Assert.assertEquals("testDockerImage",
jobRunParameters.getDockerImageName());
+ assertEquals("testDockerImage", jobRunParameters.getDockerImageName());
assertNotNull(jobRunParameters.getLocalizations());
assertEquals(2, jobRunParameters.getLocalizations().size());
@@ -100,38 +93,64 @@ public class RunJobCliParsingPyTorchYamlTest {
}
}
- private PyTorchRunJobParameters verifyWorkerCommonValues(RunJobParameters
- jobRunParameters, String prefix) {
+ private void verifyPsValues(RunJobParameters jobRunParameters,
+ String prefix) {
assertTrue(RunJobParameters.class + " must be an instance of " +
- PyTorchRunJobParameters.class,
- jobRunParameters instanceof PyTorchRunJobParameters);
- PyTorchRunJobParameters pyTorchParams =
- (PyTorchRunJobParameters) jobRunParameters;
-
- assertEquals(3, pyTorchParams.getNumWorkers());
- assertEquals(prefix + "testLaunchCmdWorker",
- pyTorchParams.getWorkerLaunchCmd());
- assertEquals(prefix + "testDockerImageWorker",
- pyTorchParams.getWorkerDockerImage());
- return pyTorchParams;
+ MXNetRunJobParameters.class,
+ jobRunParameters instanceof MXNetRunJobParameters);
+ MXNetRunJobParameters mxNetParams =
+ (MXNetRunJobParameters) jobRunParameters;
+
+ assertEquals(4, mxNetParams.getNumPS());
+ assertEquals(prefix + "testLaunchCmdPs", mxNetParams.getPSLaunchCmd());
+ assertEquals(prefix + "testDockerImagePs",
+ mxNetParams.getPsDockerImage());
+ assertEquals(Resources.createResource(20500, 34),
+ mxNetParams.getPsResource());
}
- private void verifyWorkerValues(RunJobParameters jobRunParameters,
+ private void verifySchedulerValues(RunJobParameters jobRunParameters,
String prefix) {
- PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
- (jobRunParameters, prefix);
+ assertTrue(RunJobParameters.class + " must be an instance of " +
+ MXNetRunJobParameters.class, jobRunParameters instanceof
MXNetRunJobParameters);
+ MXNetRunJobParameters mxNetParams = (MXNetRunJobParameters)
jobRunParameters;
+ assertEquals(1, mxNetParams.getNumSchedulers());
+ assertEquals(prefix + "testLaunchCmdScheduler",
+ mxNetParams.getSchedulerLaunchCmd());
+ assertEquals(prefix + "testDockerImageScheduler",
mxNetParams.getSchedulerDockerImage());
+ assertEquals(Resources.createResource(10240, 16),
+ mxNetParams.getSchedulerResource());
+ }
+
+ private void verifyWorkerValues(RunJobParameters jobRunParameters, String
prefix) {
+ MXNetRunJobParameters mxNetParams =
+ verifyWorkerCommonValues(jobRunParameters, prefix);
assertEquals(Resources.createResource(20480, 32),
- pyTorchParams.getWorkerResource());
+ mxNetParams.getWorkerResource());
}
- private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters,
- String prefix) {
+ private MXNetRunJobParameters verifyWorkerCommonValues(
+ RunJobParameters jobRunParameters, String prefix) {
+ assertTrue(RunJobParameters.class + " must be an instance of " +
+ MXNetRunJobParameters.class,
+ jobRunParameters instanceof MXNetRunJobParameters);
+ MXNetRunJobParameters mxNetParams =
+ (MXNetRunJobParameters) jobRunParameters;
- PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
- (jobRunParameters, prefix);
+ assertEquals(3, mxNetParams.getNumWorkers());
+ assertEquals(prefix + "testLaunchCmdWorker",
+ mxNetParams.getWorkerLaunchCmd());
+ assertEquals(prefix + "testDockerImageWorker",
+ mxNetParams.getWorkerDockerImage());
+ return mxNetParams;
+ }
+
+ private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters,
String prefix) {
+ MXNetRunJobParameters mxNetParams =
+ verifyWorkerCommonValues(jobRunParameters, prefix);
Resource workResource = Resources.createResource(20480, 32);
ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2);
- assertEquals(workResource, pyTorchParams.getWorkerResource());
+ assertEquals(workResource, mxNetParams.getWorkerResource());
}
private void verifySecurityValues(RunJobParameters jobRunParameters) {
@@ -149,9 +168,10 @@ public class RunJobCliParsingPyTorchYamlTest {
DIR_NAME + "/valid-config.yaml");
runJobCli.run(
new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});
-
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
verifyBasicConfigValues(jobRunParameters);
+ verifyPsValues(jobRunParameters, "");
+ verifySchedulerValues(jobRunParameters, "");
verifyWorkerValues(jobRunParameters, "");
verifySecurityValues(jobRunParameters);
}
@@ -176,6 +196,8 @@ public class RunJobCliParsingPyTorchYamlTest {
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
verifyBasicConfigValues(jobRunParameters);
+ verifyPsValues(jobRunParameters, "");
+ verifySchedulerValues(jobRunParameters, "");
verifyWorkerValuesWithGpu(jobRunParameters, "");
verifySecurityValues(jobRunParameters);
}
@@ -187,11 +209,14 @@ public class RunJobCliParsingPyTorchYamlTest {
yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
DIR_NAME + "/valid-config-with-overrides.yaml");
+
runJobCli.run(
new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
verifyBasicConfigValues(jobRunParameters);
+ verifyPsValues(jobRunParameters, OVERRIDDEN_PREFIX);
+ verifySchedulerValues(jobRunParameters, OVERRIDDEN_PREFIX);
verifyWorkerValues(jobRunParameters, OVERRIDDEN_PREFIX);
verifySecurityValues(jobRunParameters);
}
@@ -199,7 +224,6 @@ public class RunJobCliParsingPyTorchYamlTest {
@Test
public void testMissingPrincipalUnderSecuritySection() throws Exception {
RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
-
yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
DIR_NAME + "/security-principal-is-missing.yaml");
runJobCli.run(
@@ -207,6 +231,8 @@ public class RunJobCliParsingPyTorchYamlTest {
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
verifyBasicConfigValues(jobRunParameters);
+ verifyPsValues(jobRunParameters, "");
+ verifySchedulerValues(jobRunParameters, "");
verifyWorkerValues(jobRunParameters, "");
//Verify security values
@@ -218,7 +244,6 @@ public class RunJobCliParsingPyTorchYamlTest {
@Test
public void testMissingEnvs() throws Exception {
RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
-
yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
DIR_NAME + "/envs-are-missing.yaml");
runJobCli.run(
@@ -226,34 +251,21 @@ public class RunJobCliParsingPyTorchYamlTest {
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
verifyBasicConfigValues(jobRunParameters, ImmutableList.of());
+ verifyPsValues(jobRunParameters, "");
+ verifySchedulerValues(jobRunParameters, "");
verifyWorkerValues(jobRunParameters, "");
verifySecurityValues(jobRunParameters);
}
@Test
- public void testInvalidConfigPsSectionIsDefined() throws Exception {
- RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
-
- exception.expect(YamlParseException.class);
- exception.expectMessage("PS section should not be defined " +
- "when PyTorch is the selected framework");
- yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
- DIR_NAME + "/invalid-config-ps-section.yaml");
- runJobCli.run(
- new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
- }
-
- @Test
public void testInvalidConfigTensorboardSectionIsDefined() throws Exception {
RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
-
exception.expect(YamlParseException.class);
exception.expectMessage("TensorBoard section should not be defined " +
- "when PyTorch is the selected framework");
+ "when TensorFlow is not the selected framework!");
yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
DIR_NAME + "/invalid-config-tensorboard-section.yaml");
runJobCli.run(
new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
}
-
-}
+}
\ No newline at end of file
diff --git a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchTest.java b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchTest.java
index 778c190..b1ec19b 100644
--- a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchTest.java
+++ b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchTest.java
@@ -208,4 +208,75 @@ public class RunJobCliParsingPyTorchTest {
"--tensorboard_docker_image", "TBDockerImage"});
}
+ @Test
+ public void testNumSchedulerCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for PyTorch jobs");
+ runJobCli.run(
+ new String[]{"--framework", "pytorch",
+ "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--checkpoint_path", "hdfs://output",
+ "--num_workers", "3",
+ "--worker_launch_cmd",
+ "python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
+ "--num_schedulers", "1"});
+ }
+
+ @Test
+ public void testSchedulerResourcesCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for PyTorch jobs");
+ runJobCli.run(
+ new String[]{"--framework", "pytorch",
+ "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--checkpoint_path", "hdfs://output",
+ "--num_workers", "3",
+ "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=2048M,vcores=2",
+ "--scheduler_resources", "memory=2048M,vcores=2"});
+ }
+
+ @Test
+ public void testSchedulerDockerImageCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for PyTorch jobs");
+ runJobCli.run(
+ new String[]{"--framework", "pytorch",
+ "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--checkpoint_path", "hdfs://output",
+ "--num_workers", "3",
+ "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=2048M,vcores=2",
+ "--scheduler_docker_image", "schedulerDockerImage"});
+ }
+
+ @Test
+ public void testSchedulerLaunchCommandCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for PyTorch jobs");
+ runJobCli.run(
+ new String[]{"--framework", "pytorch",
+ "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--checkpoint_path", "hdfs://output",
+ "--num_workers", "3",
+ "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=2048M,vcores=2",
+ "--scheduler_launch_cmd", "schedulerLaunchCommand"});
+ }
}
diff --git a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
index 6c9c5b8..2ab075f 100644
--- a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
+++ b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
@@ -249,11 +249,23 @@ public class RunJobCliParsingPyTorchYamlTest {
exception.expect(YamlParseException.class);
exception.expectMessage("TensorBoard section should not be defined " +
- "when PyTorch is the selected framework");
+ "when TensorFlow is not the selected framework!");
yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
DIR_NAME + "/invalid-config-tensorboard-section.yaml");
runJobCli.run(
new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
}
+ @Test
+ public void testInvalidConfigSchedulerSectionIsDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+
+ exception.expect(YamlParseException.class);
+ exception.expectMessage("Scheduler section should not be defined " +
+ "when MXNet is not the selected framework!");
+ yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
+ DIR_NAME + "/invalid-config-scheduler-section.yaml");
+ runJobCli.run(
+ new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
+ }
}
diff --git a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowTest.java b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowTest.java
index 0947b7b..1dfaf42 100644
--- a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowTest.java
+++ b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowTest.java
@@ -169,4 +169,71 @@ public class RunJobCliParsingTensorFlowTest {
}
assertTrue(success);
}
+
+ @Test
+ public void testNumSchedulerCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for TensorFlow jobs");
+ runJobCli.run(
+ new String[] {"--framework", "tensorflow",
+ "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
+ "--input_path", "hdfs://input", "--checkpoint_path",
"hdfs://output",
+ "--num_workers", "1", "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=4g,vcores=2", "--tensorboard",
"true",
+ "--verbose", "--wait_job_finish", "--num_schedulers", "1"});
+ }
+
+ @Test
+ public void testSchedulerResourcesCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for TensorFlow jobs");
+ runJobCli.run(
+ new String[] {"--framework", "tensorflow",
+ "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
+ "--input_path", "hdfs://input", "--checkpoint_path",
"hdfs://output",
+ "--num_workers", "1", "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=4g,vcores=2", "--tensorboard",
"true",
+ "--verbose", "--wait_job_finish",
+ "--scheduler_resources", "memory=2048M,vcores=2"});
+ }
+
+ @Test
+ public void testSchedulerDockerImageCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for TensorFlow jobs");
+ runJobCli.run(
+ new String[] {"--framework", "tensorflow",
+ "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
+ "--input_path", "hdfs://input", "--checkpoint_path",
"hdfs://output",
+ "--num_workers", "1", "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=4g,vcores=2", "--tensorboard",
"true",
+ "--verbose", "--wait_job_finish",
+ "--scheduler_docker_image", "schedulerDockerImage"});
+ }
+
+ @Test
+ public void testSchedulerLaunchCommandCannotBeDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+ assertFalse(SubmarineLogs.isVerbose());
+
+ expectedException.expect(ParseException.class);
+ expectedException.expectMessage("cannot be defined for TensorFlow jobs");
+ runJobCli.run(
+ new String[] {"--framework", "tensorflow",
+ "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
+ "--input_path", "hdfs://input", "--checkpoint_path",
"hdfs://output",
+ "--num_workers", "1", "--worker_launch_cmd", "python run-job.py",
+ "--worker_resources", "memory=4g,vcores=2", "--tensorboard",
"true",
+ "--verbose", "--wait_job_finish",
+ "--scheduler_launch_cmd", "schedulerLaunchCommand"});
+ }
}
diff --git a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowYamlTest.java b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowYamlTest.java
index c4ddd0f..894fc08 100644
--- a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowYamlTest.java
+++ b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowYamlTest.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.submarine.client.cli.YamlConfigTestUtils;
import org.apache.submarine.client.cli.param.runjob.RunJobParameters;
import org.apache.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
+import org.apache.submarine.client.cli.param.yaml.YamlParseException;
import org.apache.submarine.client.cli.runjob.RunJobCli;
import org.apache.submarine.commons.runtime.conf.SubmarineLogs;
import
org.apache.submarine.commons.runtime.exception.SubmarineRuntimeException;
@@ -292,4 +293,16 @@ public class RunJobCliParsingTensorFlowYamlTest {
verifyTensorboardValues(jobRunParameters);
}
+ @Test
+ public void testInvalidConfigSchedulerSectionIsDefined() throws Exception {
+ RunJobCli runJobCli = new
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+
+ exception.expect(YamlParseException.class);
+ exception.expectMessage("Scheduler section should not be defined " +
+ "when MXNet is not the selected framework!");
+ yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
+ DIR_NAME + "/invalid-config-scheduler-section.yaml");
+ runJobCli.run(
+ new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
+ }
}
diff --git a/submarine-client/src/test/resources/runjob-mxnet-yaml/envs-are-missing.yaml b/submarine-client/src/test/resources/runjob-mxnet-yaml/envs-are-missing.yaml
new file mode 100644
index 0000000..51998f8
--- /dev/null
+++ b/submarine-client/src/test/resources/runjob-mxnet-yaml/envs-are-missing.yaml
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+ name: testJobName
+ job_type: testJobType
+ framework: mxnet
+
+configs:
+ input_path: testInputPath
+ checkpoint_path: testCheckpointPath
+ saved_model_path: testSavedModelPath
+ docker_image: testDockerImage
+ wait_job_finish: true
+ localizations:
+ - hdfs://remote-file1:/local-filename1:rw
+ - nfs://remote-file2:/local-filename2:rw
+ mounts:
+ - /etc/passwd:/etc/passwd:rw
+ - /etc/hosts:/etc/hosts:rw
+ quicklinks:
+ - Notebook_UI=https://master-0:7070
+ - Notebook_UI2=https://master-0:7071
+
+scheduling:
+ queue: queue1
+
+roles:
+ worker:
+ resources: memory=20480M,vcores=32
+ replicas: 3
+ launch_cmd: testLaunchCmdWorker
+ docker_image: testDockerImageWorker
+ ps:
+ resources: memory=20500M,vcores=34
+ replicas: 4
+ launch_cmd: testLaunchCmdPs
+ docker_image: testDockerImagePs
+ scheduler:
+ resources: memory=10240M,vcores=16
+ replicas: 1
+ launch_cmd: testLaunchCmdScheduler
+ docker_image: testDockerImageScheduler
+
+security:
+ keytab: keytabPath
+ principal: testPrincipal
+ distribute_keytab: true
\ No newline at end of file
diff --git a/submarine-client/src/test/resources/runjob-mxnet-yaml/invalid-config-tensorboard-section.yaml b/submarine-client/src/test/resources/runjob-mxnet-yaml/invalid-config-tensorboard-section.yaml
new file mode 100644
index 0000000..747f373
--- /dev/null
+++ b/submarine-client/src/test/resources/runjob-mxnet-yaml/invalid-config-tensorboard-section.yaml
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+ name: testJobName
+ job_type: testJobType
+ framework: mxnet
+
+configs:
+ input_path: testInputPath
+ checkpoint_path: testCheckpointPath
+ saved_model_path: testSavedModelPath
+ docker_image: testDockerImage
+ wait_job_finish: true
+ envs:
+ env1: 'env1Value'
+ env2: 'env2Value'
+ localizations:
+ - hdfs://remote-file1:/local-filename1:rw
+ - nfs://remote-file2:/local-filename2:rw
+ mounts:
+ - /etc/passwd:/etc/passwd:rw
+ - /etc/hosts:/etc/hosts:rw
+ quicklinks:
+ - Notebook_UI=https://master-0:7070
+ - Notebook_UI2=https://master-0:7071
+
+scheduling:
+ queue: queue1
+
+roles:
+ worker:
+ resources: memory=20480M,vcores=32
+ replicas: 3
+ launch_cmd: testLaunchCmdWorker
+ docker_image: testDockerImageWorker
+ ps:
+ resources: memory=20500M,vcores=34
+ replicas: 4
+ launch_cmd: testLaunchCmdPs
+ docker_image: testDockerImagePs
+ scheduler:
+ resources: memory=10240M,vcores=16
+ replicas: 1
+ launch_cmd: testLaunchCmdScheduler
+ docker_image: testDockerImageScheduler
+
+security:
+ keytab: keytabPath
+ principal: testPrincipal
+ distribute_keytab: true
+
+tensorBoard:
+ docker_image: tensorboardDockerImage
\ No newline at end of file
diff --git a/submarine-client/src/test/resources/runjob-mxnet-yaml/security-principal-is-missing.yaml b/submarine-client/src/test/resources/runjob-mxnet-yaml/security-principal-is-missing.yaml
new file mode 100644
index 0000000..fe2d1f2
--- /dev/null
+++ b/submarine-client/src/test/resources/runjob-mxnet-yaml/security-principal-is-missing.yaml
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+ name: testJobName
+ job_type: testJobType
+ framework: mxnet
+
+configs:
+ input_path: testInputPath
+ checkpoint_path: testCheckpointPath
+ saved_model_path: testSavedModelPath
+ docker_image: testDockerImage
+ wait_job_finish: true
+ envs:
+ env1: 'env1Value'
+ env2: 'env2Value'
+ localizations:
+ - hdfs://remote-file1:/local-filename1:rw
+ - nfs://remote-file2:/local-filename2:rw
+ mounts:
+ - /etc/passwd:/etc/passwd:rw
+ - /etc/hosts:/etc/hosts:rw
+ quicklinks:
+ - Notebook_UI=https://master-0:7070
+ - Notebook_UI2=https://master-0:7071
+
+scheduling:
+ queue: queue1
+
+roles:
+ worker:
+ resources: memory=20480M,vcores=32
+ replicas: 3
+ launch_cmd: testLaunchCmdWorker
+ docker_image: testDockerImageWorker
+ ps:
+ resources: memory=20500M,vcores=34
+ replicas: 4
+ launch_cmd: testLaunchCmdPs
+ docker_image: testDockerImagePs
+ scheduler:
+ resources: memory=10240M,vcores=16
+ replicas: 1
+ launch_cmd: testLaunchCmdScheduler
+ docker_image: testDockerImageScheduler
+
+security:
+ keytab: keytabPath
+ distribute_keytab: true
\ No newline at end of file
diff --git a/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config-with-overrides.yaml b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config-with-overrides.yaml
new file mode 100644
index 0000000..345a897
--- /dev/null
+++ b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config-with-overrides.yaml
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+ name: testJobName
+ job_type: testJobType
+ framework: mxnet
+
+configs:
+ input_path: testInputPath
+ checkpoint_path: testCheckpointPath
+ saved_model_path: testSavedModelPath
+ docker_image: testDockerImage
+ wait_job_finish: true
+ envs:
+ env1: 'env1Value'
+ env2: 'env2Value'
+ localizations:
+ - hdfs://remote-file1:/local-filename1:rw
+ - nfs://remote-file2:/local-filename2:rw
+ mounts:
+ - /etc/passwd:/etc/passwd:rw
+ - /etc/hosts:/etc/hosts:rw
+ quicklinks:
+ - Notebook_UI=https://master-0:7070
+ - Notebook_UI2=https://master-0:7071
+
+scheduling:
+ queue: queue1
+
+roles:
+ worker:
+ resources: memory=20480M,vcores=32
+ replicas: 3
+ launch_cmd: overridden_testLaunchCmdWorker
+ docker_image: overridden_testDockerImageWorker
+ envs:
+ env1: 'overridden_env1Worker'
+ env2: 'overridden_env2Worker'
+ localizations:
+ - hdfs://remote-file1:/overridden_local-filename1Worker:rw
+ - nfs://remote-file2:/overridden_local-filename2Worker:rw
+ mounts:
+ - /etc/passwd:/overridden_Worker
+ - /etc/hosts:/overridden_Worker
+ ps:
+ resources: memory=20500M,vcores=34
+ replicas: 4
+ launch_cmd: overridden_testLaunchCmdPs
+ docker_image: overridden_testDockerImagePs
+ envs:
+ env1: 'overridden_env1Ps'
+ env2: 'overridden_env2Ps'
+ localizations:
+ - hdfs://remote-file1:/overridden_local-filename1Ps:rw
+ - nfs://remote-file2:/overridden_local-filename2Ps:rw
+ mounts:
+ - /etc/passwd:/overridden_Ps
+ - /etc/hosts:/overridden_Ps
+ scheduler:
+ resources: memory=10240M,vcores=16
+ replicas: 1
+ launch_cmd: overridden_testLaunchCmdScheduler
+ docker_image: overridden_testDockerImageScheduler
+ envs:
+ env1: 'overridden_env1Scheduler'
+ env2: 'overridden_env2Scheduler'
+ localizations:
+ - hdfs://remote-file1:/overridden_local-filename1Scheduler:rw
+ - nfs://remote-file2:/overridden_local-filename2Scheduler:rw
+ mounts:
+ - /etc/passwd:/overridden_Scheduler
+ - /etc/hosts:/overridden_Scheduler
+
+security:
+ keytab: keytabPath
+ principal: testPrincipal
+ distribute_keytab: true
\ No newline at end of file
diff --git a/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config.yaml b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config.yaml
new file mode 100644
index 0000000..23a5dbc
--- /dev/null
+++ b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config.yaml
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+ name: testJobName
+ job_type: testJobType
+ framework: mxnet
+
+configs:
+ input_path: testInputPath
+ checkpoint_path: testCheckpointPath
+ saved_model_path: testSavedModelPath
+ docker_image: testDockerImage
+ wait_job_finish: true
+ envs:
+ env1: 'env1Value'
+ env2: 'env2Value'
+ localizations:
+ - hdfs://remote-file1:/local-filename1:rw
+ - nfs://remote-file2:/local-filename2:rw
+ mounts:
+ - /etc/passwd:/etc/passwd:rw
+ - /etc/hosts:/etc/hosts:rw
+ quicklinks:
+ - Notebook_UI=https://master-0:7070
+ - Notebook_UI2=https://master-0:7071
+
+scheduling:
+ queue: queue1
+
+roles:
+ worker:
+ resources: memory=20480M,vcores=32
+ replicas: 3
+ launch_cmd: testLaunchCmdWorker
+ docker_image: testDockerImageWorker
+ ps:
+ resources: memory=20500M,vcores=34
+ replicas: 4
+ launch_cmd: testLaunchCmdPs
+ docker_image: testDockerImagePs
+ scheduler:
+ resources: memory=10240M,vcores=16
+ replicas: 1
+ launch_cmd: testLaunchCmdScheduler
+ docker_image: testDockerImageScheduler
+
+security:
+ keytab: keytabPath
+ principal: testPrincipal
+ distribute_keytab: true
\ No newline at end of file
diff --git
a/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-gpu-config.yaml
b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-gpu-config.yaml
new file mode 100644
index 0000000..50a27d4
--- /dev/null
+++
b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-gpu-config.yaml
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+ name: testJobName
+ job_type: testJobType
+ framework: mxnet
+
+configs:
+ input_path: testInputPath
+ checkpoint_path: testCheckpointPath
+ saved_model_path: testSavedModelPath
+ docker_image: testDockerImage
+ wait_job_finish: true
+ envs:
+ env1: 'env1Value'
+ env2: 'env2Value'
+ localizations:
+ - hdfs://remote-file1:/local-filename1:rw
+ - nfs://remote-file2:/local-filename2:rw
+ mounts:
+ - /etc/passwd:/etc/passwd:rw
+ - /etc/hosts:/etc/hosts:rw
+ quicklinks:
+ - Notebook_UI=https://master-0:7070
+ - Notebook_UI2=https://master-0:7071
+
+scheduling:
+ queue: queue1
+
+roles:
+ worker:
+ resources: memory=20480M,vcores=32,gpu=2
+ replicas: 3
+ launch_cmd: testLaunchCmdWorker
+ docker_image: testDockerImageWorker
+ ps:
+ resources: memory=20500M,vcores=34
+ replicas: 4
+ launch_cmd: testLaunchCmdPs
+ docker_image: testDockerImagePs
+ scheduler:
+ resources: memory=10240M,vcores=16
+ replicas: 1
+ launch_cmd: testLaunchCmdScheduler
+ docker_image: testDockerImageScheduler
+
+security:
+ keytab: keytabPath
+ principal: testPrincipal
+ distribute_keytab: true
\ No newline at end of file
diff --git
a/submarine-client/src/test/resources/runjob-pytorch-yaml/invalid-config-scheduler-section.yaml
b/submarine-client/src/test/resources/runjob-pytorch-yaml/invalid-config-scheduler-section.yaml
new file mode 100644
index 0000000..3a8e013
--- /dev/null
+++
b/submarine-client/src/test/resources/runjob-pytorch-yaml/invalid-config-scheduler-section.yaml
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+ name: testJobName
+ job_type: testJobType
+ framework: pytorch
+
+configs:
+ input_path: testInputPath
+ checkpoint_path: testCheckpointPath
+ saved_model_path: testSavedModelPath
+ docker_image: testDockerImage
+ wait_job_finish: true
+ envs:
+ env1: 'env1Value'
+ env2: 'env2Value'
+ localizations:
+ - hdfs://remote-file1:/local-filename1:rw
+ - nfs://remote-file2:/local-filename2:rw
+ mounts:
+ - /etc/passwd:/etc/passwd:rw
+ - /etc/hosts:/etc/hosts:rw
+ quicklinks:
+ - Notebook_UI=https://master-0:7070
+ - Notebook_UI2=https://master-0:7071
+
+scheduling:
+ queue: queue1
+
+roles:
+ worker:
+ resources: memory=20480M,vcores=32
+ replicas: 2
+ launch_cmd: testLaunchCmdWorker
+ docker_image: testDockerImageWorker
+ scheduler: # NOTE(review): presumably the entry that makes this fixture invalid (scheduler role with framework pytorch) - confirm against the parsing test
+ docker_image: testDockerImageScheduler
+
+security:
+ keytab: keytabPath
+ principal: testPrincipal
+ distribute_keytab: true
\ No newline at end of file
diff --git
a/submarine-client/src/test/resources/runjob-tensorflow-yaml/invalid-config-scheduler-section.yaml
b/submarine-client/src/test/resources/runjob-tensorflow-yaml/invalid-config-scheduler-section.yaml
new file mode 100644
index 0000000..f3df4dd
--- /dev/null
+++
b/submarine-client/src/test/resources/runjob-tensorflow-yaml/invalid-config-scheduler-section.yaml
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+ name: testJobName
+ job_type: testJobType
+ framework: tensorflow
+
+configs:
+ input_path: testInputPath
+ checkpoint_path: testCheckpointPath
+ saved_model_path: testSavedModelPath
+ docker_image: testDockerImage
+ wait_job_finish: true
+ envs:
+ env1: 'env1Value'
+ env2: 'env2Value'
+ localizations:
+ - hdfs://remote-file1:/local-filename1:rw
+ - nfs://remote-file2:/local-filename2:rw
+ mounts:
+ - /etc/passwd:/etc/passwd:rw
+ - /etc/hosts:/etc/hosts:rw
+ quicklinks:
+ - Notebook_UI=https://master-0:7070
+ - Notebook_UI2=https://master-0:7071
+
+scheduling:
+ queue: queue1
+
+roles:
+ worker:
+ resources: memory=20480M,vcores=32
+ replicas: 2
+ launch_cmd: testLaunchCmdWorker
+ docker_image: testDockerImageWorker
+ ps:
+ resources: memory=20500M,vcores=34
+ replicas: 2
+ launch_cmd: testLaunchCmdPs
+ docker_image: testDockerImagePs
+ scheduler: # NOTE(review): presumably the entry that makes this fixture invalid (scheduler role with framework tensorflow) - confirm against the parsing test
+ docker_image: testDockerImageScheduler
+
+security:
+ keytab: keytabPath
+ principal: testPrincipal
+ distribute_keytab: true
\ No newline at end of file
diff --git
a/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/Framework.java
b/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/Framework.java
index efe974b..1ca1a96 100644
---
a/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/Framework.java
+++
b/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/Framework.java
@@ -28,7 +28,7 @@ import java.util.stream.Collectors;
* Represents the type of Machine learning framework to work with.
*/
public enum Framework {
- TENSORFLOW(Constants.TENSORFLOW_NAME), PYTORCH(Constants.PYTORCH_NAME);
+ TENSORFLOW(Constants.TENSORFLOW_NAME), PYTORCH(Constants.PYTORCH_NAME),
MXNET(Constants.MXNET_NAME);
private String value;
@@ -58,5 +58,6 @@ public enum Framework {
private static class Constants {
static final String TENSORFLOW_NAME = "tensorflow";
static final String PYTORCH_NAME = "pytorch";
+ static final String MXNET_NAME = "mxnet";
}
}
diff --git
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
b/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/api/MXNetRole.java
similarity index 66%
copy from
submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
copy to
submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/api/MXNetRole.java
index fab6f31..5c635ce 100644
---
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
+++
b/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/api/MXNetRole.java
@@ -17,28 +17,30 @@
* under the License.
*/
-package org.apache.submarine.client.cli.param.yaml;
+package org.apache.submarine.commons.runtime.api;
/**
- * This class represents a section of the YAML configuration file.
+ * Enum to represent a MXNet Role.
*/
-public class Roles {
- private Role worker;
- private Role ps;
+public enum MXNetRole implements Role {
+ PRIMARY_WORKER("master"), // component name is "master", not the constant's own name
+ WORKER("worker"),
+ PS("ps"),
+ SCHEDULER("scheduler");
- public Role getWorker() {
- return worker;
- }
+ private String compName; // component name passed to the constructor; never reassigned in this enum
- public void setWorker(Role worker) {
- this.worker = worker;
+ MXNetRole(String compName) {
+ this.compName = compName;
}
- public Role getPs() {
- return ps;
+ @Override
+ public String getComponentName() { // returns the string given in the constant declaration above
+ return compName;
}
- public void setPs(Role ps) {
- this.ps = ps;
+ @Override
+ public String getName() { // returns the enum constant identifier via name(), e.g. "PRIMARY_WORKER"
+ return name();
}
}
diff --git
a/submarine-server/server-submitter/submitter-yarn/src/main/java/org/apache/submarine/server/submitter/yarn/YarnUtils.java
b/submarine-server/server-submitter/submitter-yarn/src/main/java/org/apache/submarine/server/submitter/yarn/YarnUtils.java
index 07a26ed..1b11d5c 100644
---
a/submarine-server/server-submitter/submitter-yarn/src/main/java/org/apache/submarine/server/submitter/yarn/YarnUtils.java
+++
b/submarine-server/server-submitter/submitter-yarn/src/main/java/org/apache/submarine/server/submitter/yarn/YarnUtils.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.submarine.client.cli.CliConstants;
import org.apache.submarine.client.cli.param.Localization;
import org.apache.submarine.client.cli.param.ParametersHolder;
+import org.apache.submarine.commons.runtime.Framework;
import org.apache.submarine.commons.runtime.param.Parameter;
import org.apache.submarine.commons.runtime.resource.ResourceUtils;
@@ -54,67 +55,24 @@ public final class YarnUtils {
parameters.getFramework().getValue());
tonyConf.setStrings(TonyConfigurationKeys.APPLICATION_NAME,
parameters.getParameters().getName());
- tonyConf.setStrings(
- TonyConfigurationKeys.getInstancesKey(Constants.WORKER_JOB_NAME),
- parameters.getOptionValue(CliConstants.N_WORKERS));
- if (parameters.getOptionValue(CliConstants.N_PS) != null) {
- tonyConf.setStrings(
- TonyConfigurationKeys.getInstancesKey(Constants.PS_JOB_NAME),
- parameters.getOptionValue(CliConstants.N_PS));
- }
- // Resources for PS & Worker
- if (parameters.getOptionValue(CliConstants.PS_RES) != null) {
- Resource psResource = getResource(parameters, CliConstants.PS_RES);
- tonyConf.setInt(
- TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME,
- Constants.VCORES),
- psResource.getVirtualCores());
- tonyConf.setLong(
- TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME,
- Constants.MEMORY),
- ResourceUtils.getMemorySize(psResource));
+ setParametersForWorker(tonyConf, parameters);
+ setParametersForPS(tonyConf, parameters);
+ if (parameters.getFramework() == Framework.MXNET) {
+ setParametersForScheduler(tonyConf, parameters);
}
- if (parameters.getOptionValue(CliConstants.WORKER_RES) != null) {
- Resource workerResource = getResource(parameters,
CliConstants.WORKER_RES);
- tonyConf.setInt(
- TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
- Constants.VCORES),
- workerResource.getVirtualCores());
- tonyConf.setLong(
- TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
- Constants.MEMORY),
- ResourceUtils.getMemorySize(workerResource));
- tonyConf.setLong(
- TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
- Constants.GPUS),
- ResourceUtils.getResourceValue(workerResource,
- ResourceUtils.GPU_URI));
- }
if (parameters.getOptionValue(CliConstants.QUEUE) != null) {
tonyConf.set(
TonyConfigurationKeys.YARN_QUEUE_NAME,
parameters.getOptionValue(CliConstants.QUEUE));
}
- // Set up Docker for PS & Worker
+
if (parameters.getOptionValue(CliConstants.DOCKER_IMAGE) != null) {
tonyConf.set(TonyConfigurationKeys.getContainerDockerKey(),
parameters.getOptionValue(CliConstants.DOCKER_IMAGE));
tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true);
}
- if (parameters.getOptionValue(CliConstants.WORKER_DOCKER_IMAGE) != null) {
- tonyConf.set(
- TonyConfigurationKeys.getDockerImageKey(Constants.WORKER_JOB_NAME),
- parameters.getOptionValue(CliConstants.WORKER_DOCKER_IMAGE));
- tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true);
- }
- if (parameters.getOptionValue(CliConstants.PS_DOCKER_IMAGE) != null) {
- tonyConf.set(
- TonyConfigurationKeys.getDockerImageKey(Constants.PS_JOB_NAME),
- parameters.getOptionValue(CliConstants.PS_DOCKER_IMAGE));
- tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true);
- }
// Set up container environment
if (parameters.getOptionValues(CliConstants.ENV) != null) {
@@ -132,18 +90,6 @@ public final class YarnUtils {
}
// Update after SUBMARINE-104 is merged into tony.
// tonyConf.setStrings(TonyConfigurationKeys.APPLICATION_TYPE,
SUBMARINE_RUNTIME_APP_TYPE);
- // Set up running command
- if (parameters.getOptionValue(CliConstants.WORKER_LAUNCH_CMD) != null) {
- tonyConf.set(
-
TonyConfigurationKeys.getExecuteCommandKey(Constants.WORKER_JOB_NAME),
- parameters.getOptionValue(CliConstants.WORKER_LAUNCH_CMD));
- }
-
- if (parameters.getOptionValue(CliConstants.PS_LAUNCH_CMD) != null) {
- tonyConf.set(
- TonyConfigurationKeys.getExecuteCommandKey(Constants.PS_JOB_NAME),
- parameters.getOptionValue(CliConstants.PS_LAUNCH_CMD));
- }
tonyConf.setBoolean(TonyConfigurationKeys.SECURITY_ENABLED,
!parameters.hasOption(CliConstants.INSECURE_CLUSTER));
@@ -203,4 +149,113 @@ public final class YarnUtils {
}
return ResourceUtils.createResourceFromString(resourceStr);
}
+
+ private static void setParametersForWorker (Configuration tonyConf, // Copies the worker-role CLI options from parameters into tonyConf.
+ ParametersHolder parameters) throws YarnException, ParseException {
+ tonyConf.setStrings( // Worker instance count is written without a null check, unlike the optional settings below.
+ TonyConfigurationKeys.getInstancesKey(Constants.WORKER_JOB_NAME),
+ parameters.getOptionValue(CliConstants.N_WORKERS));
+
+ if (parameters.getOptionValue(CliConstants.WORKER_RES) != null) { // Resource keys are only written when the option was supplied.
+ Resource workerResource = getResource(parameters,
CliConstants.WORKER_RES);
+
+ tonyConf.setInt(
+ TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
+ Constants.VCORES),
+ workerResource.getVirtualCores());
+ tonyConf.setLong(
+ TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
+ Constants.MEMORY),
+ ResourceUtils.getMemorySize(workerResource));
+ tonyConf.setLong( // GPU count, read from the resource via ResourceUtils.GPU_URI.
+ TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
+ Constants.GPUS),
+ ResourceUtils.getResourceValue(workerResource,
+ ResourceUtils.GPU_URI));
+ }
+
+ if (parameters.getOptionValue(CliConstants.WORKER_DOCKER_IMAGE) != null) {
+ tonyConf.set(
+
TonyConfigurationKeys.getDockerImageKey(Constants.WORKER_JOB_NAME),
+ parameters.getOptionValue(CliConstants.WORKER_DOCKER_IMAGE));
+ tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true); // A worker-specific image also switches Docker on.
+ }
+
+ if (parameters.getOptionValue(CliConstants.WORKER_LAUNCH_CMD) != null) {
+ tonyConf.set(
+
TonyConfigurationKeys.getExecuteCommandKey(Constants.WORKER_JOB_NAME),
+ parameters.getOptionValue(CliConstants.WORKER_LAUNCH_CMD));
+ }
+ }
+
+ private static void setParametersForPS (Configuration tonyConf, // Copies the parameter-server CLI options from parameters into tonyConf.
+ ParametersHolder parameters) throws YarnException, ParseException {
+ String jobName = Constants.PS_JOB_NAME; // Job name used in every key below unless the framework is MXNet.
+ if (parameters.getFramework() == Framework.MXNET) { // For MXNet the same PS options are keyed under SERVER_JOB_NAME.
+ jobName = Constants.SERVER_JOB_NAME;
+ }
+
+ if (parameters.getOptionValue(CliConstants.N_PS) != null) {
+ tonyConf.setStrings(
+ TonyConfigurationKeys.getInstancesKey(jobName),
+ parameters.getOptionValue(CliConstants.N_PS));
+ }
+ if (parameters.getOptionValue(CliConstants.PS_RES) != null) { // Only vcores and memory are written; no GPUS key is set here.
+ Resource psResource = getResource(parameters, CliConstants.PS_RES);
+
+ tonyConf.setInt(
+ TonyConfigurationKeys.getResourceKey(jobName,
+ Constants.VCORES),
+ psResource.getVirtualCores());
+ tonyConf.setLong(
+ TonyConfigurationKeys.getResourceKey(jobName,
+ Constants.MEMORY),
+ ResourceUtils.getMemorySize(psResource));
+ }
+
+ if (parameters.getOptionValue(CliConstants.PS_LAUNCH_CMD) != null) {
+ tonyConf.set(
+ TonyConfigurationKeys.getExecuteCommandKey(jobName),
+ parameters.getOptionValue(CliConstants.PS_LAUNCH_CMD));
+ }
+
+ if (parameters.getOptionValue(CliConstants.PS_DOCKER_IMAGE) != null) {
+ tonyConf.set(
+ TonyConfigurationKeys.getDockerImageKey(jobName),
+ parameters.getOptionValue(CliConstants.PS_DOCKER_IMAGE));
+ tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true); // A PS-specific image also switches Docker on.
+ }
+ }
+
+ private static void setParametersForScheduler (Configuration tonyConf, // Copies the scheduler-role CLI options from parameters into tonyConf; every setting is optional.
+ ParametersHolder parameters) throws YarnException, ParseException {
+ if (parameters.getOptionValue(CliConstants.N_SCHEDULERS) != null) {
+ tonyConf.setStrings(
+ TonyConfigurationKeys.getInstancesKey(Constants.SCHEDULER_JOB_NAME),
+ parameters.getOptionValue(CliConstants.N_SCHEDULERS));
+ }
+
+ if (parameters.getOptionValue(CliConstants.SCHEDULER_RES) != null) { // Only vcores and memory are written; no GPUS key is set here.
+ Resource schedulerResource = getResource(parameters, CliConstants.SCHEDULER_RES);
+ tonyConf.setInt(
+ TonyConfigurationKeys.getResourceKey(Constants.SCHEDULER_JOB_NAME,
+ Constants.VCORES),
+ schedulerResource.getVirtualCores());
+ tonyConf.setLong(
+ TonyConfigurationKeys.getResourceKey(Constants.SCHEDULER_JOB_NAME,
+ Constants.MEMORY),
+ ResourceUtils.getMemorySize(schedulerResource));
+ }
+ if (parameters.getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD) != null) {
+ tonyConf.set(
+ TonyConfigurationKeys.getExecuteCommandKey(Constants.SCHEDULER_JOB_NAME),
+ parameters.getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD));
+ }
+ if (parameters.getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE) != null) {
+ tonyConf.set(
+ TonyConfigurationKeys.getDockerImageKey(Constants.SCHEDULER_JOB_NAME),
+ parameters.getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE));
+ tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true); // Same as the worker and PS helpers: a role-specific image must also switch Docker on.
+ }
+ }
}
diff --git
a/submarine-server/server-submitter/submitter-yarn/src/test/java/YarnUtilsTest.java
b/submarine-server/server-submitter/submitter-yarn/src/test/java/YarnUtilsTest.java
index e919e41..f360abe 100644
---
a/submarine-server/server-submitter/submitter-yarn/src/test/java/YarnUtilsTest.java
+++
b/submarine-server/server-submitter/submitter-yarn/src/test/java/YarnUtilsTest.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.submarine.client.cli.param.ParametersHolder;
+import org.apache.submarine.client.cli.param.runjob.MXNetRunJobParameters;
import org.apache.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
import org.apache.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
import org.apache.submarine.client.cli.runjob.RunJobCli;
@@ -159,4 +160,62 @@ public class YarnUtilsTest {
// Update after SUBMARINE-104 is merged into tony.
// Assert.assertEquals("SUBMARINE",
tonyConf.get(TonyConfigurationKeys.APPLICATION_TYPE));
}
+
+ @Test
+ public void testMXNetTonyConfFromClientContext() throws Exception { // Runs the CLI with MXNet options, then checks the TonY configuration derived from them.
+ RunJobCli runJobCli = new RunJobCli(getMockClientContext());
+ runJobCli.run(
+ new String[] {"--framework", "mxnet", "--name", "my-job",
+ "--docker_image", "mxnet-docker:1.1.0",
+ "--input_path", "hdfs://input",
+ "--num_workers", "2", "--num_ps", "2", "--worker_launch_cmd",
+ "python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
+ "--ps_resources", "memory=4G,vcores=4", "--ps_launch_cmd",
+ "python run-ps.py", "--num_schedulers", "1",
"--scheduler_launch_cmd",
+ "python run-scheduler.py", "--scheduler_resources",
"memory=2048M,vcores=2"});
+ RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
+ ParametersHolder parametersHolder = runJobCli.getParametersHolder();
+
+ assertTrue(RunJobParameters.class + " must be an instance of " + // NOTE(review): the message prints the RunJobParameters class literal, not the runtime class of jobRunParameters
+ MXNetRunJobParameters.class,
+ jobRunParameters instanceof MXNetRunJobParameters);
+ MXNetRunJobParameters mxNetParams = (MXNetRunJobParameters)
jobRunParameters;
+
+ Configuration tonyConf = YarnUtils
+ .tonyConfFromClientContext(parametersHolder);
+ Assert.assertEquals(parametersHolder.getFramework().getValue(),
+ tonyConf.get(TonyConfigurationKeys.FRAMEWORK_NAME));
+ Assert.assertEquals(jobRunParameters.getName(),
+ tonyConf.get(TonyConfigurationKeys.APPLICATION_NAME));
+ Assert.assertEquals(jobRunParameters.getDockerImageName(),
+ tonyConf.get(TonyConfigurationKeys.getContainerDockerKey()));
+
+ Assert.assertEquals("2", tonyConf.get(TonyConfigurationKeys // worker role: instance count from --num_workers
+ .getInstancesKey("worker")));
+ Assert.assertEquals(mxNetParams.getWorkerLaunchCmd(),
+ tonyConf.get(TonyConfigurationKeys
+ .getExecuteCommandKey("worker")));
+ Assert.assertEquals("2048", tonyConf.get(TonyConfigurationKeys
+ .getResourceKey(Constants.WORKER_JOB_NAME, Constants.MEMORY)));
+ Assert.assertEquals("2", tonyConf.get(TonyConfigurationKeys
+ .getResourceKey(Constants.WORKER_JOB_NAME, Constants.VCORES)));
+ Assert.assertEquals("2", // the --num_ps value is looked up under SERVER_JOB_NAME for MXNet
+
tonyConf.get(TonyConfigurationKeys.getInstancesKey(Constants.SERVER_JOB_NAME)));
+ Assert.assertEquals("4096", tonyConf.get(TonyConfigurationKeys // --ps_resources memory=4G is expected back as "4096"
+ .getResourceKey(Constants.SERVER_JOB_NAME, Constants.MEMORY)));
+ Assert.assertEquals("4", tonyConf.get(TonyConfigurationKeys
+ .getResourceKey(Constants.SERVER_JOB_NAME, Constants.VCORES)));
+ Assert.assertEquals(mxNetParams.getPSLaunchCmd(),
+
tonyConf.get(TonyConfigurationKeys.getExecuteCommandKey(Constants.SERVER_JOB_NAME)));
+ Assert.assertEquals("1", // scheduler role: instance count from --num_schedulers
+
tonyConf.get(TonyConfigurationKeys.getInstancesKey(Constants.SCHEDULER_JOB_NAME)));
+ Assert.assertEquals("2048", tonyConf.get(TonyConfigurationKeys
+ .getResourceKey(Constants.SCHEDULER_JOB_NAME, Constants.MEMORY)));
+ Assert.assertEquals("2", tonyConf.get(TonyConfigurationKeys
+ .getResourceKey(Constants.SCHEDULER_JOB_NAME, Constants.VCORES)));
+ Assert.assertEquals(mxNetParams.getSchedulerLaunchCmd(),
+
tonyConf.get(TonyConfigurationKeys.getExecuteCommandKey(Constants.SCHEDULER_JOB_NAME)));
+ // Update after SUBMARINE-104 is merged into tony.
+ //Assert.assertEquals("SUBMARINE",
tonyConf.get(TonyConfigurationKeys.APPLICATION_TYPE));
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]