This is an automated email from the ASF dual-hosted git repository.

zhouquan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/submarine.git


The following commit(s) were added to refs/heads/master by this push:
     new 6e19ae9  SUBMARINE-202. submarine core need to support MXNet
6e19ae9 is described below

commit 6e19ae95d0977a6fdf5708a94c5f113f67aff25c
Author: Ryan Lo <[email protected]>
AuthorDate: Thu Feb 13 15:22:45 2020 +0800

    SUBMARINE-202. submarine core need to support MXNet
    
    ### What is this PR for?
    This PR adds support for the MXNet framework to Submarine core.
    
    ### What type of PR is it?
    [Improvement]
    
    ### Todos
    * [ ] - Task
    
    ### What is the Jira issue?
    
    [SUBMARINE-202](https://issues.apache.org/jira/projects/SUBMARINE/issues/SUBMARINE-202)
    
    ### How should this be tested?
    [passed CI](https://travis-ci.org/lowc1012/submarine/builds/646785076)
    
    ### Screenshots (if appropriate)
    <img width="1212" alt="screenshot1" src="https://user-images.githubusercontent.com/52355146/73934141-d207f100-4918-11ea-86a4-920f084ff76e.png">
    <img width="1214" alt="screenshot2" src="https://user-images.githubusercontent.com/52355146/73934180-e8ae4800-4918-11ea-8c8e-6baa2c29f2af.png">
    
    ### Questions:
    * Do the license files need an update? No
    * Are there breaking changes for older versions? No
    * Does this need documentation? No
    
    Author: Ryan Lo <[email protected]>
    
    Closes #174 from lowc1012/SUBMARINE-202 and squashes the following commits:
    
    6af5f18 [Ryan Lo] SUBMARINE-202. submarine core need to support MXNet
---
 .../apache/submarine/client/cli/CliConstants.java  |   4 +
 .../client/cli/param/ParametersHolder.java         |  42 +++-
 .../cli/param/runjob/MXNetRunJobParameters.java    | 236 +++++++++++++++++++++
 .../cli/param/runjob/PyTorchRunJobParameters.java  |  14 ++
 .../param/runjob/TensorFlowRunJobParameters.java   |  27 +++
 .../submarine/client/cli/param/yaml/Roles.java     |   9 +
 .../submarine/client/cli/runjob/RunJobCli.java     |  44 +++-
 .../runjob/mxnet/RunJobCliParsingMXNetTest.java    | 175 +++++++++++++++
 .../RunJobCliParsingMXNetYamlTest.java}            | 144 +++++++------
 .../pytorch/RunJobCliParsingPyTorchTest.java       |  71 +++++++
 .../pytorch/RunJobCliParsingPyTorchYamlTest.java   |  14 +-
 .../tensorflow/RunJobCliParsingTensorFlowTest.java |  67 ++++++
 .../RunJobCliParsingTensorFlowYamlTest.java        |  13 ++
 .../runjob-mxnet-yaml/envs-are-missing.yaml        |  61 ++++++
 .../invalid-config-tensorboard-section.yaml        |  67 ++++++
 .../security-principal-is-missing.yaml             |  63 ++++++
 .../valid-config-with-overrides.yaml               |  91 ++++++++
 .../resources/runjob-mxnet-yaml/valid-config.yaml  |  64 ++++++
 .../runjob-mxnet-yaml/valid-gpu-config.yaml        |  64 ++++++
 .../invalid-config-scheduler-section.yaml          |  56 +++++
 .../invalid-config-scheduler-section.yaml          |  61 ++++++
 .../submarine/commons/runtime/Framework.java       |   3 +-
 .../submarine/commons/runtime/api/MXNetRole.java   |  30 +--
 .../submarine/server/submitter/yarn/YarnUtils.java | 175 +++++++++------
 .../src/test/java/YarnUtilsTest.java               |  59 ++++++
 25 files changed, 1496 insertions(+), 158 deletions(-)

diff --git 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/CliConstants.java
 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/CliConstants.java
index d99d56e..e89ad1b 100644
--- 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/CliConstants.java
+++ 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/CliConstants.java
@@ -37,6 +37,8 @@ public class CliConstants {
   public static final String WORKER_RES = "worker_resources";
   public static final String SERVING_RES = "serving_resources";
   public static final String PS_RES = "ps_resources";
+  public static final String SCHEDULER_RES = "scheduler_resources";
+  public static final String N_SCHEDULERS = "num_schedulers";
   public static final String DOCKER_IMAGE = "docker_image";
   public static final String QUEUE = "queue";
   public static final String TENSORBOARD = "tensorboard";
@@ -48,6 +50,7 @@ public class CliConstants {
   public static final String WORKER_LAUNCH_CMD = "worker_launch_cmd";
   public static final String SERVING_LAUNCH_CMD = "serving_launch_cmd";
   public static final String PS_LAUNCH_CMD = "ps_launch_cmd";
+  public static final String SCHEDULER_LAUNCH_CMD = "scheduler_launch_cmd";
   public static final String ENV = "env";
   public static final String VERBOSE = "verbose";
   public static final String SERVING_FRAMEWORK = "serving_framework";
@@ -55,6 +58,7 @@ public class CliConstants {
   public static final String WAIT_JOB_FINISH = "wait_job_finish";
   public static final String PS_DOCKER_IMAGE = "ps_docker_image";
   public static final String WORKER_DOCKER_IMAGE = "worker_docker_image";
+  public static final String SCHEDULER_DOCKER_IMAGE = "scheduler_docker_image";
   public static final String QUICKLINK = "quicklink";
   public static final String TENSORBOARD_DOCKER_IMAGE =
       "tensorboard_docker_image";
diff --git 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/ParametersHolder.java
 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/ParametersHolder.java
index 44891f7..556dfe6 100644
--- 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/ParametersHolder.java
+++ 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/ParametersHolder.java
@@ -27,6 +27,7 @@ import org.apache.commons.cli.ParseException;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.submarine.client.cli.CliConstants;
 import org.apache.submarine.client.cli.Command;
+import org.apache.submarine.client.cli.param.runjob.MXNetRunJobParameters;
 import org.apache.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
 import org.apache.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
 import org.apache.submarine.client.cli.param.yaml.Configs;
@@ -69,7 +70,7 @@ public final class ParametersHolder implements Parameter {
       LoggerFactory.getLogger(ParametersHolder.class);
 
   public static final String SUPPORTED_FRAMEWORKS_MESSAGE =
-      "TensorFlow and PyTorch are the only supported frameworks for now!";
+      "TensorFlow, PyTorch, MXNet are the only supported frameworks for now!";
   public static final String SUPPORTED_COMMANDS_MESSAGE =
       "'Show job' and 'run job' are the only supported commands for now!";
 
@@ -108,6 +109,8 @@ public final class ParametersHolder implements Parameter {
         return new TensorFlowRunJobParameters();
       } else if (framework == Framework.PYTORCH) {
         return new PyTorchRunJobParameters();
+      } else if (framework == Framework.MXNET) {
+        return new MXNetRunJobParameters();
       } else {
         throw new UnsupportedOperationException(SUPPORTED_FRAMEWORKS_MESSAGE);
       }
@@ -126,11 +129,18 @@ public final class ParametersHolder implements Parameter {
               "is the selected framework!");
     }
 
-    if (isCommandRunJob() && isFrameworkPyTorch() &&
-        isTensorboardSectionDefined(yamlConfig)) {
+    if (isCommandRunJob() && (isFrameworkPyTorch() || isFrameworkMXNet())
+        && isTensorboardSectionDefined(yamlConfig)) {
       throw new YamlParseException(
-          "TensorBoard section should not be defined when PyTorch " +
-              "is the selected framework!");
+          "TensorBoard section should not be defined when TensorFlow " +
+              "is not the selected framework!");
+    }
+
+    if (isCommandRunJob() && !isFrameworkMXNet() &&
+        isSchedulerSectionDefined(yamlConfig)) {
+      throw new YamlParseException(
+          "Scheduler section should not be defined when MXNet " +
+              "is not the selected framework!");
     }
   }
 
@@ -142,6 +152,10 @@ public final class ParametersHolder implements Parameter {
     return framework == Framework.PYTORCH;
   }
 
+  private boolean isFrameworkMXNet() {
+    return framework == Framework.MXNET;
+  }
+
   private boolean isPsSectionDefined(YamlConfigFile yamlConfig) {
     return yamlConfig != null &&
         yamlConfig.getRoles() != null &&
@@ -153,6 +167,12 @@ public final class ParametersHolder implements Parameter {
         yamlConfig.getTensorBoard() != null;
   }
 
+  private boolean isSchedulerSectionDefined(YamlConfigFile yamlConfig) {
+    return yamlConfig != null &&
+        yamlConfig.getRoles() != null &&
+        yamlConfig.getRoles().getScheduler() != null;
+  }
+
   private Framework determineFrameworkType()
       throws ParseException, YarnException {
     if (!isCommandRunJob()) {
@@ -195,6 +215,7 @@ public final class ParametersHolder implements Parameter {
     initGenericConfigs(yamlConfig, yamlConfigValues);
     initPs(yamlConfigValues, roles.getPs());
     initWorker(yamlConfigValues, roles.getWorker());
+    initScheduler(yamlConfigValues, roles.getScheduler());
     initScheduling(yamlConfigValues, yamlConfig.getScheduling());
     initSecurity(yamlConfigValues, yamlConfig.getSecurity());
     initTensorBoard(yamlConfigValues, yamlConfig.getTensorBoard());
@@ -253,6 +274,17 @@ public final class ParametersHolder implements Parameter {
     yamlConfigs.put(CliConstants.WORKER_LAUNCH_CMD, worker.getLaunchCmd());
   }
 
+  private void initScheduler(Map<String, String> yamlConfigs, Role scheduler) {
+    if (scheduler == null) {
+      return;
+    }
+    yamlConfigs.put(CliConstants.N_SCHEDULERS,
+            String.valueOf(scheduler.getReplicas()));
+    yamlConfigs.put(CliConstants.SCHEDULER_RES, scheduler.getResources());
+    yamlConfigs.put(CliConstants.SCHEDULER_DOCKER_IMAGE, 
scheduler.getDockerImage());
+    yamlConfigs.put(CliConstants.SCHEDULER_LAUNCH_CMD, 
scheduler.getLaunchCmd());
+  }
+
   private void initScheduling(Map<String, String> yamlConfigValues,
       Scheduling scheduling) {
     if (scheduling == null) {
diff --git 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/MXNetRunJobParameters.java
 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/MXNetRunJobParameters.java
new file mode 100644
index 0000000..4c4671b
--- /dev/null
+++ 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/MXNetRunJobParameters.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.submarine.client.cli.param.runjob;
+
+import com.google.common.collect.Lists;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.submarine.client.cli.CliConstants;
+import org.apache.submarine.client.cli.CliUtils;
+import org.apache.submarine.client.cli.runjob.RoleParameters;
+import org.apache.submarine.commons.runtime.ClientContext;
+import org.apache.submarine.commons.runtime.api.MXNetRole;
+import org.apache.submarine.commons.runtime.param.Parameter;
+import org.apache.submarine.commons.runtime.resource.ResourceUtils;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Parameters for MXNet job.
+ */
+public class MXNetRunJobParameters extends RunJobParameters {
+    private RoleParameters psParameters =
+            RoleParameters.createEmpty(MXNetRole.PS);
+
+    private RoleParameters schedulerParameters =
+            RoleParameters.createEmpty(MXNetRole.SCHEDULER);
+
+    private static final String CANNOT_BE_DEFINED_FOR_MXNET =
+            "cannot be defined for MXNet jobs!";
+
+    @Override
+    public void updateParameters(Parameter parametersHolder, ClientContext 
clientContext)
+            throws ParseException, IOException, YarnException {
+
+        checkArguments(parametersHolder);
+        super.updateParameters(parametersHolder, clientContext);
+
+        String input = 
parametersHolder.getOptionValue(CliConstants.INPUT_PATH);
+        this.workerParameters = generateWorkerParameters(clientContext, 
parametersHolder, input);
+        this.psParameters = getPSParameters(parametersHolder);
+        this.schedulerParameters = getSchedulerParameters(parametersHolder);
+        this.distributed = 
determineIfDistributed(workerParameters.getReplicas(),
+                psParameters.getReplicas(), schedulerParameters.getReplicas());
+
+        executePostOperations(clientContext);
+    }
+
+    @Override
+    void executePostOperations(ClientContext clientContext) throws IOException 
{
+        // Set default job dir / saved model dir, etc.
+        setDefaultDirs(clientContext);
+        replacePatternsInParameters(clientContext);
+    }
+
+    @Override
+    public List<String> getLaunchCommands() {
+        return Lists.newArrayList(getWorkerLaunchCmd(), getPSLaunchCmd(), 
getSchedulerLaunchCmd());
+    }
+
+    private void replacePatternsInParameters(ClientContext clientContext)
+            throws IOException {
+        if (StringUtils.isNotEmpty(getPSLaunchCmd())) {
+            String afterReplace =
+                    CliUtils.replacePatternsInLaunchCommand(getPSLaunchCmd(), 
this,
+                            clientContext.getRemoteDirectoryManager());
+            setPSLaunchCmd(afterReplace);
+        }
+        if (StringUtils.isNotEmpty(getWorkerLaunchCmd())) {
+            String afterReplace =
+                    
CliUtils.replacePatternsInLaunchCommand(getWorkerLaunchCmd(), this,
+                            clientContext.getRemoteDirectoryManager());
+            setWorkerLaunchCmd(afterReplace);
+        }
+        if (StringUtils.isNotEmpty(getSchedulerLaunchCmd())) {
+            String afterReplace =
+                    
CliUtils.replacePatternsInLaunchCommand(getSchedulerLaunchCmd(), this,
+                            clientContext.getRemoteDirectoryManager());
+            setSchedulerLaunchCmd(afterReplace);
+        }
+    }
+
+    private void checkArguments(Parameter parametersHolder)
+        throws YarnException, ParseException {
+        if (parametersHolder.hasOption(CliConstants.TENSORBOARD)) {
+            throw new ParseException(getParamCannotBeDefinedErrorMessage(
+                    CliConstants.TENSORBOARD));
+        } else if (parametersHolder
+                .getOptionValue(CliConstants.TENSORBOARD_RESOURCES) != null) {
+            throw new ParseException(getParamCannotBeDefinedErrorMessage(
+                    CliConstants.TENSORBOARD_RESOURCES));
+        } else if (parametersHolder
+                .getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE) != 
null) {
+            throw new ParseException(getParamCannotBeDefinedErrorMessage(
+                    CliConstants.TENSORBOARD_DOCKER_IMAGE));
+        }
+    }
+
+    private boolean determineIfDistributed(int nWorkers, int nPS, int 
nSchedulers) {
+        return nWorkers >= 2 && nPS > 0 && nSchedulers == 1;
+    }
+
+    private String getParamCannotBeDefinedErrorMessage(String cliName) {
+        return String.format(
+                "Parameter '%s' " + CANNOT_BE_DEFINED_FOR_MXNET, cliName);
+    }
+
+    private RoleParameters getPSParameters(Parameter parametersHolder)
+            throws YarnException, ParseException {
+        int nPS = getNumberOfPS(parametersHolder);
+        Resource psResource =
+                determinePSResource(parametersHolder, nPS);
+        String psDockerImage =
+                parametersHolder.getOptionValue(CliConstants.PS_DOCKER_IMAGE);
+        String psLaunchCommand =
+                parametersHolder.getOptionValue(CliConstants.PS_LAUNCH_CMD);
+        return new RoleParameters(MXNetRole.PS, nPS, psLaunchCommand,
+                psDockerImage, psResource);
+    }
+
+    private Resource determinePSResource(Parameter parametersHolder, int nPS)
+            throws ParseException, YarnException {
+        if (nPS > 0) {
+            String psResourceStr =
+                    parametersHolder.getOptionValue(CliConstants.PS_RES);
+            if (psResourceStr == null) {
+                throw new ParseException("--" + CliConstants.PS_RES + " is 
absent.");
+            }
+            return ResourceUtils.createResourceFromString(psResourceStr);
+        }
+        return null;
+    }
+
+    public String getPSLaunchCmd() {
+        return psParameters.getLaunchCommand();
+    }
+
+    public void setPSLaunchCmd(String launchCmd) {
+        psParameters.setLaunchCommand(launchCmd);
+    }
+
+    private int getNumberOfPS(Parameter parametersHolder) throws YarnException 
{
+        int nPS = 0;
+        if (parametersHolder.getOptionValue(CliConstants.N_PS) != null) {
+            nPS = 
Integer.parseInt(parametersHolder.getOptionValue(CliConstants.N_PS));
+        }
+        return nPS;
+    }
+
+    private RoleParameters getSchedulerParameters(Parameter parametersHolder)
+            throws YarnException, ParseException {
+        int nSchedulers = getNumberOfScheduler(parametersHolder);
+        Resource schedulerResource =
+                determineSchedulerResource(parametersHolder, nSchedulers);
+        String schedulerDockerImage =
+                
parametersHolder.getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE);
+        String schedulerLaunchCommand =
+                
parametersHolder.getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD);
+        return new RoleParameters(MXNetRole.SCHEDULER, nSchedulers, 
schedulerLaunchCommand,
+                schedulerDockerImage, schedulerResource);
+    }
+
+    private Resource determineSchedulerResource(Parameter parametersHolder, 
int nSchedulers)
+            throws ParseException, YarnException {
+        if (nSchedulers > 0) {
+            String schedulerResourceStr = 
parametersHolder.getOptionValue(CliConstants.SCHEDULER_RES);
+            if (schedulerResourceStr == null) {
+                throw new ParseException("--" + CliConstants.SCHEDULER_RES + " 
is absent.");
+            }
+            return 
ResourceUtils.createResourceFromString(schedulerResourceStr);
+        }
+        return null;
+    }
+
+    private int getNumberOfScheduler(Parameter parametersHolder) throws 
ParseException, YarnException {
+        int nSchedulers = 0;
+        if (parametersHolder.getOptionValue(CliConstants.N_SCHEDULERS) != 
null) {
+            nSchedulers = 
Integer.parseInt(parametersHolder.getOptionValue(CliConstants.N_SCHEDULERS));
+            if (nSchedulers > 1 || nSchedulers < 0) {
+                throw new ParseException("--" + CliConstants.N_SCHEDULERS + " 
should be 1 or 0");
+            }
+        }
+        return nSchedulers;
+    }
+
+    public String getSchedulerLaunchCmd() {
+        return schedulerParameters.getLaunchCommand();
+    }
+
+    public void setSchedulerLaunchCmd(String launchCmd) {
+        schedulerParameters.setLaunchCommand(launchCmd);
+    }
+
+    public int getNumPS() {
+        return psParameters.getReplicas();
+    }
+
+    public Resource getPsResource() {
+        return psParameters.getResource();
+    }
+
+    public String getPsDockerImage() {
+        return psParameters.getDockerImage();
+    }
+
+    public int getNumSchedulers() {
+        return schedulerParameters.getReplicas();
+    }
+
+    public Resource getSchedulerResource() {
+        return schedulerParameters.getResource();
+    }
+
+    public String getSchedulerDockerImage() {
+        return schedulerParameters.getDockerImage();
+    }
+}
diff --git 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/PyTorchRunJobParameters.java
 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/PyTorchRunJobParameters.java
index 80767cf..ad0b64f 100644
--- 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/PyTorchRunJobParameters.java
+++ 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/PyTorchRunJobParameters.java
@@ -81,6 +81,20 @@ public class PyTorchRunJobParameters extends 
RunJobParameters {
         .getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE) != null) {
       throw new ParseException(getParamCannotBeDefinedErrorMessage(
           CliConstants.TENSORBOARD_DOCKER_IMAGE));
+    } else if (parametersHolder.getOptionValue(CliConstants.N_SCHEDULERS) != 
null) {
+      throw new ParseException(getParamCannotBeDefinedErrorMessage(
+          CliConstants.N_SCHEDULERS));
+    } else if (parametersHolder.getOptionValue(CliConstants.SCHEDULER_RES) != 
null) {
+      throw new ParseException(getParamCannotBeDefinedErrorMessage(
+          CliConstants.SCHEDULER_RES));
+    } else if (parametersHolder
+        .getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE) != null) {
+      throw new ParseException(getParamCannotBeDefinedErrorMessage(
+          CliConstants.SCHEDULER_DOCKER_IMAGE));
+    } else if (parametersHolder
+        .getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD) != null) {
+      throw new ParseException(getParamCannotBeDefinedErrorMessage(
+          CliConstants.SCHEDULER_LAUNCH_CMD));
     }
   }
 
diff --git 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java
 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java
index 06f2bc0..925c3f3 100644
--- 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java
+++ 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/runjob/TensorFlowRunJobParameters.java
@@ -40,6 +40,8 @@ import java.util.List;
  */
 public class TensorFlowRunJobParameters extends RunJobParameters {
   private boolean tensorboardEnabled;
+  private static final String CANNOT_BE_DEFINED_FOR_TF =
+      "cannot be defined for TensorFlow jobs!";
   private RoleParameters psParameters =
       RoleParameters.createEmpty(TensorFlowRole.PS);
   private RoleParameters tensorBoardParameters =
@@ -48,6 +50,7 @@ public class TensorFlowRunJobParameters extends 
RunJobParameters {
   @Override
   public void updateParameters(Parameter parametersHolder, ClientContext 
clientContext)
       throws ParseException, IOException, YarnException {
+    checkArguments(parametersHolder);
     super.updateParameters(parametersHolder, clientContext);
 
     String input = parametersHolder.getOptionValue(CliConstants.INPUT_PATH);
@@ -72,6 +75,30 @@ public class TensorFlowRunJobParameters extends 
RunJobParameters {
     replacePatternsInParameters(clientContext);
   }
 
+  private void checkArguments(Parameter parametersHolder)
+      throws YarnException, ParseException {
+    if (parametersHolder.getOptionValue(CliConstants.N_SCHEDULERS) != null) {
+      throw new ParseException(getParamCannotBeDefinedErrorMessage(
+          CliConstants.N_SCHEDULERS));
+    } else if (parametersHolder.getOptionValue(CliConstants.SCHEDULER_RES) != 
null) {
+      throw new ParseException(getParamCannotBeDefinedErrorMessage(
+          CliConstants.SCHEDULER_RES));
+    } else if (parametersHolder
+        .getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE) != null) {
+      throw new ParseException(getParamCannotBeDefinedErrorMessage(
+          CliConstants.SCHEDULER_DOCKER_IMAGE));
+    } else if (parametersHolder
+        .getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD) != null) {
+      throw new ParseException(getParamCannotBeDefinedErrorMessage(
+          CliConstants.SCHEDULER_LAUNCH_CMD));
+    }
+  }
+
+  private String getParamCannotBeDefinedErrorMessage(String cliName) {
+    return String.format(
+        "Parameter '%s' " + CANNOT_BE_DEFINED_FOR_TF, cliName);
+  }
+
   private void replacePatternsInParameters(ClientContext clientContext)
       throws IOException {
     if (StringUtils.isNotEmpty(getPSLaunchCmd())) {
diff --git 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
index fab6f31..810c36f 100644
--- 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
+++ 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
@@ -25,6 +25,7 @@ package org.apache.submarine.client.cli.param.yaml;
 public class Roles {
   private Role worker;
   private Role ps;
+  private Role scheduler;
 
   public Role getWorker() {
     return worker;
@@ -41,4 +42,12 @@ public class Roles {
   public void setPs(Role ps) {
     this.ps = ps;
   }
+
+  public Role getScheduler() {
+    return scheduler;
+  }
+
+  public void setScheduler(Role scheduler) {
+    this.scheduler = scheduler;
+  }
 }
diff --git 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/runjob/RunJobCli.java
 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/runjob/RunJobCli.java
index 69426d5..c317d3f 100644
--- 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/runjob/RunJobCli.java
+++ 
b/submarine-client/src/main/java/org/apache/submarine/client/cli/runjob/RunJobCli.java
@@ -60,10 +60,14 @@ import java.util.Map;
 public class RunJobCli extends AbstractCli {
   private static final Logger LOG =
       LoggerFactory.getLogger(RunJobCli.class);
-  private static final String CAN_BE_USED_WITH_TF_PYTORCH =
-      "Can be used with TensorFlow or PyTorch frameworks.";
+  private static final String CAN_BE_USED_WITH_TF_PYTORCH_MXNET =
+          "Can be used with TensorFlow or PyTorch or MXNet frameworks.";
+  private static final String CAN_BE_USED_WITH_TF_MXNET =
+      "Can be used with TensorFlow or MXNet frameworks.";
   private static final String CAN_BE_USED_WITH_TF_ONLY =
       "Can only be used with TensorFlow framework.";
+  private static final String CAN_BE_USED_WITH_MXNET_ONLY =
+          "Can only be used with MXNet framework.";
   public static final String YAML_PARSE_FAILED = "Failed to parse " +
       "YAML config";
 
@@ -117,6 +121,7 @@ public class RunJobCli extends AbstractCli {
 
     addWorkerOptions(options);
     addPSOptions(options);
+    addSchedulerOptions(options);
     addTensorboardOptions(options);
 
     options.addOption(CliConstants.ENV, true,
@@ -170,37 +175,37 @@ public class RunJobCli extends AbstractCli {
   private void addWorkerOptions(Options options) {
     options.addOption(CliConstants.N_WORKERS, true,
         "Number of worker tasks of the job, by default it's 1." +
-            CAN_BE_USED_WITH_TF_PYTORCH);
+            CAN_BE_USED_WITH_TF_PYTORCH_MXNET);
     options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
         "Specify docker image for WORKER, when this is not specified, WORKER "
             + "uses --" + CliConstants.DOCKER_IMAGE + " as default." +
-            CAN_BE_USED_WITH_TF_PYTORCH);
+            CAN_BE_USED_WITH_TF_PYTORCH_MXNET);
     options.addOption(CliConstants.WORKER_LAUNCH_CMD, true,
         "Commandline of worker, arguments will be "
             + "directly used to launch the worker" +
-            CAN_BE_USED_WITH_TF_PYTORCH);
+            CAN_BE_USED_WITH_TF_PYTORCH_MXNET);
     options.addOption(CliConstants.WORKER_RES, true,
         "Resource of each worker, for example "
             + "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
-            CAN_BE_USED_WITH_TF_PYTORCH);
+            CAN_BE_USED_WITH_TF_PYTORCH_MXNET);
   }
 
   private void addPSOptions(Options options) {
     options.addOption(CliConstants.N_PS, true,
         "Number of PS tasks of the job, by default it's 0. " +
-            CAN_BE_USED_WITH_TF_ONLY);
+            CAN_BE_USED_WITH_TF_MXNET);
     options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
         "Specify docker image for PS, when this is not specified, PS uses --"
             + CliConstants.DOCKER_IMAGE + " as default." +
-            CAN_BE_USED_WITH_TF_ONLY);
+            CAN_BE_USED_WITH_TF_MXNET);
     options.addOption(CliConstants.PS_LAUNCH_CMD, true,
-        "Commandline of worker, arguments will be "
+        "Commandline of PS, arguments will be "
             + "directly used to launch the PS" +
-            CAN_BE_USED_WITH_TF_ONLY);
+            CAN_BE_USED_WITH_TF_MXNET);
     options.addOption(CliConstants.PS_RES, true,
         "Resource of each PS, for example "
             + "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
-            CAN_BE_USED_WITH_TF_ONLY);
+            CAN_BE_USED_WITH_TF_MXNET);
   }
 
   private void addTensorboardOptions(Options options) {
@@ -219,6 +224,23 @@ public class RunJobCli extends AbstractCli {
             CAN_BE_USED_WITH_TF_ONLY);
   }
 
+  private void addSchedulerOptions(Options options) {
+    options.addOption(CliConstants.N_SCHEDULERS, true,
+        "Number of scheduler tasks of the job. " +
+        "It should be 1 or 0, by default it's 0."+
+        CAN_BE_USED_WITH_MXNET_ONLY);
+    options.addOption(CliConstants.SCHEDULER_DOCKER_IMAGE, true,
+        "Specify docker image for scheduler, when this is not specified, " +
+        "scheduler uses --" + CliConstants.DOCKER_IMAGE +
+        " as default. " + CAN_BE_USED_WITH_MXNET_ONLY);
+    options.addOption(CliConstants.SCHEDULER_LAUNCH_CMD, true,
+        "Commandline of scheduler, arguments will be " +
+        "directly used to launch the scheduler. " + 
CAN_BE_USED_WITH_MXNET_ONLY);
+    options.addOption(CliConstants.SCHEDULER_RES, true,
+        "Resource of each scheduler, for example " +
+        "memory-mb=2048,vcores=2. " + CAN_BE_USED_WITH_MXNET_ONLY);
+  }
+
   private void parseCommandLineAndGetRunJobParameters(String[] args)
       throws ParseException, IOException, YarnException {
     try {
diff --git 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetTest.java
 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetTest.java
new file mode 100644
index 0000000..c6c0678
--- /dev/null
+++ 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetTest.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.submarine.client.cli.runjob.mxnet;
+
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.yarn.util.resource.Resources;
+import org.apache.submarine.client.cli.param.runjob.MXNetRunJobParameters;
+import org.apache.submarine.client.cli.param.runjob.RunJobParameters;
+import org.apache.submarine.client.cli.runjob.RunJobCli;
+import org.apache.submarine.client.cli.runjob.RunJobCliParsingCommonTest;
+import org.apache.submarine.commons.runtime.conf.SubmarineLogs;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import static org.junit.Assert.*;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test class that verifies the correctness of MXNet
+ * CLI configuration parsing.
+ */
+public class RunJobCliParsingMXNetTest {
+  /** Turns verbose logging off so every test starts from the same state. */
+  @Before
+  public void before() {
+    SubmarineLogs.verboseOff();
+  }
+
+  /** Lets the negative tests declare the exception they expect. */
+  @Rule
+  public ExpectedException expectedException = ExpectedException.none();
+
+  /**
+   * Parses an MXNet job with one worker, one PS and one scheduler and checks
+   * the parsed input path, worker settings, PS and scheduler counts, and the
+   * verbose and wait-for-finish flags.
+   */
+  @Test
+  public void testBasicRunJobForSingleNodeTraining() throws Exception {
+    RunJobCli runJobCli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    runJobCli.run(
+        new String[]{"--framework", "mxnet", "--name", "my-job",
+            "--docker_image", "mxnet-docker:1.1.0",
+            "--input_path", "hdfs://input",
+            "--num_workers", "1", "--num_ps", "1", "--worker_launch_cmd",
+            "python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
+            "--ps_resources", "memory=4G,vcores=2", "--ps_launch_cmd",
+            "python run-ps.py", "--num_schedulers", "1",
+            "--scheduler_launch_cmd", "python run-scheduler.py",
+            "--scheduler_resources", "memory=1024M,vcores=2",
+            "--verbose", "--wait_job_finish"});
+
+    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
+    assertTrue(RunJobParameters.class +
+        " must be an instance of " +
+        MXNetRunJobParameters.class,
+        jobRunParameters instanceof MXNetRunJobParameters);
+    MXNetRunJobParameters mxNetParams =
+        (MXNetRunJobParameters) jobRunParameters;
+
+    // assertEquals takes (expected, actual); this order keeps the failure
+    // message from reporting the two values the wrong way round.
+    assertEquals("hdfs://input", jobRunParameters.getInputPath());
+    assertEquals(1, mxNetParams.getNumWorkers());
+    assertEquals("python run-job.py", mxNetParams.getWorkerLaunchCmd());
+    assertEquals(Resources.createResource(2048, 2),
+        mxNetParams.getWorkerResource());
+    assertEquals(1, mxNetParams.getNumPS());
+    assertEquals(1, mxNetParams.getNumSchedulers());
+    assertTrue(SubmarineLogs.isVerbose());
+    assertTrue(jobRunParameters.isWaitJobFinish());
+  }
+
+  /**
+   * Parses an MXNet job with two workers, two PS and one scheduler and checks
+   * the count, resource and launch command parsed for each role, plus the
+   * input path, docker image and verbose flag.
+   */
+  @Test
+  public void testBasicRunJobForDistributedTraining() throws Exception {
+    RunJobCli runJobCli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+    runJobCli.run(
+        new String[]{"--framework", "mxnet", "--name", "my-job",
+            "--docker_image", "mxnet-docker:1.1.0",
+            "--input_path", "hdfs://input",
+            "--num_workers", "2", "--num_ps", "2", "--worker_launch_cmd",
+            "python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
+            "--ps_resources", "memory=4G,vcores=2", "--ps_launch_cmd",
+            "python run-ps.py", "--num_schedulers", "1",
+            "--scheduler_launch_cmd", "python run-scheduler.py",
+            "--scheduler_resources", "memory=1024M,vcores=2",
+            "--verbose"});
+    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
+    assertTrue(RunJobParameters.class +
+        " must be an instance of " +
+        MXNetRunJobParameters.class,
+        jobRunParameters instanceof MXNetRunJobParameters);
+    MXNetRunJobParameters mxNetParams =
+        (MXNetRunJobParameters) jobRunParameters;
+
+    // Expected value first, actual value second (JUnit convention).
+    assertEquals("hdfs://input", jobRunParameters.getInputPath());
+    assertEquals("mxnet-docker:1.1.0", jobRunParameters.getDockerImageName());
+    assertEquals(2, mxNetParams.getNumWorkers());
+    assertEquals(Resources.createResource(2048, 2),
+        mxNetParams.getWorkerResource());
+    assertEquals("python run-job.py", mxNetParams.getWorkerLaunchCmd());
+    assertEquals(2, mxNetParams.getNumPS());
+    assertEquals(Resources.createResource(4096, 2),
+        mxNetParams.getPsResource());
+    assertEquals("python run-ps.py", mxNetParams.getPSLaunchCmd());
+    assertEquals(1, mxNetParams.getNumSchedulers());
+    assertEquals(Resources.createResource(1024, 2),
+        mxNetParams.getSchedulerResource());
+    assertEquals("python run-scheduler.py",
+        mxNetParams.getSchedulerLaunchCmd());
+    assertTrue(SubmarineLogs.isVerbose());
+  }
+
+  /**
+   * Expects a ParseException containing "cannot be defined for MXNet jobs"
+   * when --tensorboard is passed to an MXNet job.
+   */
+  @Test
+  public void testTensorboardCannotBeDefined() throws Exception {
+    RunJobCli runJobCli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for MXNet jobs");
+    runJobCli.run(
+        new String[]{"--framework", "mxnet",
+            "--name", "my-job", "--docker_image", "mxnet-docker:1.1.0",
+            "--input_path", "hdfs://input",
+            "--num_workers", "2",
+            "--worker_launch_cmd", "python run-job.py",
+            "--worker_resources", "memory=2048M,vcores=2",
+            "--tensorboard"});
+  }
+
+  /**
+   * Expects a ParseException containing "cannot be defined for MXNet jobs"
+   * when --tensorboard_resources is passed to an MXNet job.
+   */
+  @Test
+  public void testTensorboardResourcesCannotBeDefined() throws Exception {
+    RunJobCli runJobCli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for MXNet jobs");
+    runJobCli.run(
+        new String[]{"--framework", "mxnet",
+            "--name", "my-job", "--docker_image", "mxnet-docker:1.1.0",
+            "--input_path", "hdfs://input",
+            "--num_workers", "2",
+            "--worker_launch_cmd", "python run-job.py",
+            "--worker_resources", "memory=2048M,vcores=2",
+            "--tensorboard_resources", "memory=1024M,vcores=2"});
+  }
+
+  /**
+   * Expects a ParseException containing "cannot be defined for MXNet jobs"
+   * when --tensorboard_docker_image is passed to an MXNet job.
+   */
+  @Test
+  public void testTensorboardDockerImageCannotBeDefined() throws Exception {
+    RunJobCli runJobCli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for MXNet jobs");
+    runJobCli.run(
+        new String[]{"--framework", "mxnet",
+            "--name", "my-job", "--docker_image", "mxnet-docker:1.1.0",
+            "--input_path", "hdfs://input",
+            "--num_workers", "2",
+            "--worker_launch_cmd", "python run-job.py",
+            "--worker_resources", "memory=2048M,vcores=2",
+            "--tensorboard_docker_image", "TBDockerImage"});
+  }
+}
+
diff --git 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetYamlTest.java
similarity index 71%
copy from 
submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
copy to 
submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetYamlTest.java
index 6c9c5b8..0aa60f6 100644
--- 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
+++ 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/mxnet/RunJobCliParsingMXNetYamlTest.java
@@ -17,48 +17,41 @@
  * under the License.
  */
 
-package org.apache.submarine.client.cli.runjob.pytorch;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.util.List;
+package org.apache.submarine.client.cli.runjob.mxnet;
 
+import com.google.common.collect.ImmutableList;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.util.resource.Resources;
 import org.apache.submarine.client.cli.YamlConfigTestUtils;
-import org.apache.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
+import org.apache.submarine.client.cli.param.runjob.MXNetRunJobParameters;
 import org.apache.submarine.client.cli.param.runjob.RunJobParameters;
 import org.apache.submarine.client.cli.param.yaml.YamlParseException;
 import org.apache.submarine.client.cli.runjob.RunJobCli;
+import org.apache.submarine.client.cli.runjob.RunJobCliParsingCommonTest;
 import org.apache.submarine.commons.runtime.conf.SubmarineLogs;
 import 
org.apache.submarine.commons.runtime.exception.SubmarineRuntimeException;
 import org.apache.submarine.commons.runtime.resource.ResourceUtils;
-import org.apache.hadoop.yarn.util.resource.Resources;
-import org.apache.submarine.client.cli.runjob.RunJobCliParsingCommonTest;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
+import org.junit.*;
 import org.junit.rules.ExpectedException;
-
-import com.google.common.collect.ImmutableList;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.File;
+import java.util.List;
+
+import static org.junit.Assert.*;
+import static org.junit.Assert.assertNotNull;
+
 /**
- * Test class that verifies the correctness of PyTorch
+ * Test class that verifies the correctness of MXNet
  * YAML configuration parsing.
  */
-public class RunJobCliParsingPyTorchYamlTest {
+public class RunJobCliParsingMXNetYamlTest {
   private static final String OVERRIDDEN_PREFIX = "overridden_";
-  private static final String DIR_NAME = "runjob-pytorch-yaml";
+  private static final String DIR_NAME = "runjob-mxnet-yaml";
   private File yamlConfig;
   private static Logger LOG = LoggerFactory.getLogger(
-      RunJobCliParsingPyTorchYamlTest.class);
+    RunJobCliParsingMXNetYamlTest.class);
 
   @Before
   public void before() {
@@ -82,7 +75,7 @@ public class RunJobCliParsingPyTorchYamlTest {
       List<String> expectedEnvs) {
     assertEquals("testInputPath", jobRunParameters.getInputPath());
     assertEquals("testCheckpointPath", jobRunParameters.getCheckpointPath());
-    Assert.assertEquals("testDockerImage", 
jobRunParameters.getDockerImageName());
+    assertEquals("testDockerImage", jobRunParameters.getDockerImageName());
 
     assertNotNull(jobRunParameters.getLocalizations());
     assertEquals(2, jobRunParameters.getLocalizations().size());
@@ -100,38 +93,64 @@ public class RunJobCliParsingPyTorchYamlTest {
     }
   }
 
-  private PyTorchRunJobParameters verifyWorkerCommonValues(RunJobParameters
-      jobRunParameters, String prefix) {
+  private void verifyPsValues(RunJobParameters jobRunParameters,
+      String prefix) {
     assertTrue(RunJobParameters.class + " must be an instance of " +
-            PyTorchRunJobParameters.class,
-        jobRunParameters instanceof PyTorchRunJobParameters);
-    PyTorchRunJobParameters pyTorchParams =
-        (PyTorchRunJobParameters) jobRunParameters;
-
-    assertEquals(3, pyTorchParams.getNumWorkers());
-    assertEquals(prefix + "testLaunchCmdWorker",
-        pyTorchParams.getWorkerLaunchCmd());
-    assertEquals(prefix + "testDockerImageWorker",
-        pyTorchParams.getWorkerDockerImage());
-    return pyTorchParams;
+            MXNetRunJobParameters.class,
+        jobRunParameters instanceof MXNetRunJobParameters);
+    MXNetRunJobParameters mxNetParams =
+        (MXNetRunJobParameters) jobRunParameters;
+
+    assertEquals(4, mxNetParams.getNumPS());
+    assertEquals(prefix + "testLaunchCmdPs", mxNetParams.getPSLaunchCmd());
+    assertEquals(prefix + "testDockerImagePs",
+        mxNetParams.getPsDockerImage());
+    assertEquals(Resources.createResource(20500, 34),
+        mxNetParams.getPsResource());
   }
 
-  private void verifyWorkerValues(RunJobParameters jobRunParameters,
+  private void verifySchedulerValues(RunJobParameters jobRunParameters,
       String prefix) {
-    PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
-        (jobRunParameters, prefix);
+    assertTrue(RunJobParameters.class + " must be an instance of " +
+        MXNetRunJobParameters.class, jobRunParameters instanceof 
MXNetRunJobParameters);
+    MXNetRunJobParameters mxNetParams = (MXNetRunJobParameters) 
jobRunParameters;
+    assertEquals(1, mxNetParams.getNumSchedulers());
+    assertEquals(prefix + "testLaunchCmdScheduler",
+        mxNetParams.getSchedulerLaunchCmd());
+    assertEquals(prefix + "testDockerImageScheduler", 
mxNetParams.getSchedulerDockerImage());
+    assertEquals(Resources.createResource(10240, 16),
+        mxNetParams.getSchedulerResource());
+  }
+
+  private void verifyWorkerValues(RunJobParameters jobRunParameters, String 
prefix) {
+    MXNetRunJobParameters mxNetParams =
+        verifyWorkerCommonValues(jobRunParameters, prefix);
     assertEquals(Resources.createResource(20480, 32),
-        pyTorchParams.getWorkerResource());
+        mxNetParams.getWorkerResource());
   }
 
-  private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters,
-      String prefix) {
+  private MXNetRunJobParameters verifyWorkerCommonValues(
+          RunJobParameters jobRunParameters, String prefix) {
+    assertTrue(RunJobParameters.class + " must be an instance of " +
+                    MXNetRunJobParameters.class,
+            jobRunParameters instanceof MXNetRunJobParameters);
+    MXNetRunJobParameters mxNetParams =
+            (MXNetRunJobParameters) jobRunParameters;
 
-    PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
-        (jobRunParameters, prefix);
+    assertEquals(3, mxNetParams.getNumWorkers());
+    assertEquals(prefix + "testLaunchCmdWorker",
+            mxNetParams.getWorkerLaunchCmd());
+    assertEquals(prefix + "testDockerImageWorker",
+            mxNetParams.getWorkerDockerImage());
+    return mxNetParams;
+  }
+
+  private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters, 
String prefix) {
+    MXNetRunJobParameters mxNetParams =
+        verifyWorkerCommonValues(jobRunParameters, prefix);
     Resource workResource = Resources.createResource(20480, 32);
     ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2);
-    assertEquals(workResource, pyTorchParams.getWorkerResource());
+    assertEquals(workResource, mxNetParams.getWorkerResource());
   }
 
   private void verifySecurityValues(RunJobParameters jobRunParameters) {
@@ -149,9 +168,10 @@ public class RunJobCliParsingPyTorchYamlTest {
         DIR_NAME + "/valid-config.yaml");
     runJobCli.run(
         new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});
-
     RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
     verifyBasicConfigValues(jobRunParameters);
+    verifyPsValues(jobRunParameters, "");
+    verifySchedulerValues(jobRunParameters, "");
     verifyWorkerValues(jobRunParameters, "");
     verifySecurityValues(jobRunParameters);
   }
@@ -176,6 +196,8 @@ public class RunJobCliParsingPyTorchYamlTest {
 
     RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
     verifyBasicConfigValues(jobRunParameters);
+    verifyPsValues(jobRunParameters, "");
+    verifySchedulerValues(jobRunParameters, "");
     verifyWorkerValuesWithGpu(jobRunParameters, "");
     verifySecurityValues(jobRunParameters);
   }
@@ -187,11 +209,14 @@ public class RunJobCliParsingPyTorchYamlTest {
 
     yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
         DIR_NAME + "/valid-config-with-overrides.yaml");
+
     runJobCli.run(
         new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
 
     RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
     verifyBasicConfigValues(jobRunParameters);
+    verifyPsValues(jobRunParameters, OVERRIDDEN_PREFIX);
+    verifySchedulerValues(jobRunParameters, OVERRIDDEN_PREFIX);
     verifyWorkerValues(jobRunParameters, OVERRIDDEN_PREFIX);
     verifySecurityValues(jobRunParameters);
   }
@@ -199,7 +224,6 @@ public class RunJobCliParsingPyTorchYamlTest {
   @Test
   public void testMissingPrincipalUnderSecuritySection() throws Exception {
     RunJobCli runJobCli = new 
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
-
     yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
         DIR_NAME + "/security-principal-is-missing.yaml");
     runJobCli.run(
@@ -207,6 +231,8 @@ public class RunJobCliParsingPyTorchYamlTest {
 
     RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
     verifyBasicConfigValues(jobRunParameters);
+    verifyPsValues(jobRunParameters, "");
+    verifySchedulerValues(jobRunParameters, "");
     verifyWorkerValues(jobRunParameters, "");
 
     //Verify security values
@@ -218,7 +244,6 @@ public class RunJobCliParsingPyTorchYamlTest {
   @Test
   public void testMissingEnvs() throws Exception {
     RunJobCli runJobCli = new 
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
-
     yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
         DIR_NAME + "/envs-are-missing.yaml");
     runJobCli.run(
@@ -226,34 +251,21 @@ public class RunJobCliParsingPyTorchYamlTest {
 
     RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
     verifyBasicConfigValues(jobRunParameters, ImmutableList.of());
+    verifyPsValues(jobRunParameters, "");
+    verifySchedulerValues(jobRunParameters, "");
     verifyWorkerValues(jobRunParameters, "");
     verifySecurityValues(jobRunParameters);
   }
 
   @Test
-  public void testInvalidConfigPsSectionIsDefined() throws Exception {
-    RunJobCli runJobCli = new 
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
-
-    exception.expect(YamlParseException.class);
-    exception.expectMessage("PS section should not be defined " +
-        "when PyTorch is the selected framework");
-    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
-        DIR_NAME + "/invalid-config-ps-section.yaml");
-    runJobCli.run(
-        new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
-  }
-
-  @Test
   public void testInvalidConfigTensorboardSectionIsDefined() throws Exception {
     RunJobCli runJobCli = new 
RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
-
     exception.expect(YamlParseException.class);
     exception.expectMessage("TensorBoard section should not be defined " +
-        "when PyTorch is the selected framework");
+        "when TensorFlow is not the selected framework!");
     yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
         DIR_NAME + "/invalid-config-tensorboard-section.yaml");
     runJobCli.run(
         new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
   }
-
-}
+}
\ No newline at end of file
diff --git 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchTest.java
 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchTest.java
index 778c190..b1ec19b 100644
--- 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchTest.java
+++ 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchTest.java
@@ -208,4 +208,75 @@ public class RunJobCliParsingPyTorchTest {
             "--tensorboard_docker_image", "TBDockerImage"});
   }
 
+  /**
+   * Expects a ParseException containing "cannot be defined for PyTorch jobs"
+   * when --num_schedulers is passed to a PyTorch job.
+   */
+  @Test
+  public void testNumSchedulerCannotBeDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for PyTorch jobs");
+
+    String[] cliArgs = {
+        "--framework", "pytorch",
+        "--name", "my-job",
+        "--docker_image", "tf-docker:1.1.0",
+        "--input_path", "hdfs://input",
+        "--checkpoint_path", "hdfs://output",
+        "--num_workers", "3",
+        "--worker_launch_cmd", "python run-job.py",
+        "--worker_resources", "memory=2048M,vcores=2",
+        "--num_schedulers", "1"
+    };
+    cli.run(cliArgs);
+  }
+
+  /**
+   * Expects a ParseException containing "cannot be defined for PyTorch jobs"
+   * when --scheduler_resources is passed to a PyTorch job.
+   */
+  @Test
+  public void testSchedulerResourcesCannotBeDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for PyTorch jobs");
+
+    String[] cliArgs = {
+        "--framework", "pytorch",
+        "--name", "my-job",
+        "--docker_image", "tf-docker:1.1.0",
+        "--input_path", "hdfs://input",
+        "--checkpoint_path", "hdfs://output",
+        "--num_workers", "3",
+        "--worker_launch_cmd", "python run-job.py",
+        "--worker_resources", "memory=2048M,vcores=2",
+        "--scheduler_resources", "memory=2048M,vcores=2"
+    };
+    cli.run(cliArgs);
+  }
+
+  /**
+   * Expects a ParseException containing "cannot be defined for PyTorch jobs"
+   * when --scheduler_docker_image is passed to a PyTorch job.
+   */
+  @Test
+  public void testSchedulerDockerImageCannotBeDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for PyTorch jobs");
+
+    String[] cliArgs = {
+        "--framework", "pytorch",
+        "--name", "my-job",
+        "--docker_image", "tf-docker:1.1.0",
+        "--input_path", "hdfs://input",
+        "--checkpoint_path", "hdfs://output",
+        "--num_workers", "3",
+        "--worker_launch_cmd", "python run-job.py",
+        "--worker_resources", "memory=2048M,vcores=2",
+        "--scheduler_docker_image", "schedulerDockerImage"
+    };
+    cli.run(cliArgs);
+  }
+
+  /**
+   * Expects a ParseException containing "cannot be defined for PyTorch jobs"
+   * when --scheduler_launch_cmd is passed to a PyTorch job.
+   */
+  @Test
+  public void testSchedulerLaunchCommandCannotBeDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for PyTorch jobs");
+
+    String[] cliArgs = {
+        "--framework", "pytorch",
+        "--name", "my-job",
+        "--docker_image", "tf-docker:1.1.0",
+        "--input_path", "hdfs://input",
+        "--checkpoint_path", "hdfs://output",
+        "--num_workers", "3",
+        "--worker_launch_cmd", "python run-job.py",
+        "--worker_resources", "memory=2048M,vcores=2",
+        "--scheduler_launch_cmd", "schedulerLaunchCommand"
+    };
+    cli.run(cliArgs);
+  }
 }
diff --git 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
index 6c9c5b8..2ab075f 100644
--- 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
+++ 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/pytorch/RunJobCliParsingPyTorchYamlTest.java
@@ -249,11 +249,23 @@ public class RunJobCliParsingPyTorchYamlTest {
 
     exception.expect(YamlParseException.class);
     exception.expectMessage("TensorBoard section should not be defined " +
-        "when PyTorch is the selected framework");
+        "when TensorFlow is not the selected framework!");
     yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
         DIR_NAME + "/invalid-config-tensorboard-section.yaml");
     runJobCli.run(
         new String[]{"-f", yamlConfig.getAbsolutePath(), "--verbose"});
   }
 
+  /**
+   * Loads invalid-config-scheduler-section.yaml and expects a
+   * YamlParseException stating that the scheduler section should not be
+   * defined when MXNet is not the selected framework.
+   */
+  @Test
+  public void testInvalidConfigSchedulerSectionIsDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+
+    exception.expect(YamlParseException.class);
+    exception.expectMessage("Scheduler section should not be defined " +
+        "when MXNet is not the selected framework!");
+
+    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
+        DIR_NAME + "/invalid-config-scheduler-section.yaml");
+    String[] cliArgs = {"-f", yamlConfig.getAbsolutePath(), "--verbose"};
+    cli.run(cliArgs);
+  }
 }
diff --git 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowTest.java
 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowTest.java
index 0947b7b..1dfaf42 100644
--- 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowTest.java
+++ 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowTest.java
@@ -169,4 +169,71 @@ public class RunJobCliParsingTensorFlowTest {
     }
     assertTrue(success);
   }
+
+  /**
+   * Expects a ParseException containing
+   * "cannot be defined for TensorFlow jobs" when --num_schedulers is passed
+   * to a TensorFlow job.
+   */
+  @Test
+  public void testNumSchedulerCannotBeDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for TensorFlow jobs");
+
+    String[] cliArgs = {
+        "--framework", "tensorflow",
+        "--name", "my-job",
+        "--docker_image", "tf-docker:1.1.0",
+        "--input_path", "hdfs://input",
+        "--checkpoint_path", "hdfs://output",
+        "--num_workers", "1",
+        "--worker_launch_cmd", "python run-job.py",
+        "--worker_resources", "memory=4g,vcores=2",
+        "--tensorboard", "true",
+        "--verbose",
+        "--wait_job_finish",
+        "--num_schedulers", "1"
+    };
+    cli.run(cliArgs);
+  }
+
+  /**
+   * Expects a ParseException containing
+   * "cannot be defined for TensorFlow jobs" when --scheduler_resources is
+   * passed to a TensorFlow job.
+   */
+  @Test
+  public void testSchedulerResourcesCannotBeDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for TensorFlow jobs");
+
+    String[] cliArgs = {
+        "--framework", "tensorflow",
+        "--name", "my-job",
+        "--docker_image", "tf-docker:1.1.0",
+        "--input_path", "hdfs://input",
+        "--checkpoint_path", "hdfs://output",
+        "--num_workers", "1",
+        "--worker_launch_cmd", "python run-job.py",
+        "--worker_resources", "memory=4g,vcores=2",
+        "--tensorboard", "true",
+        "--verbose",
+        "--wait_job_finish",
+        "--scheduler_resources", "memory=2048M,vcores=2"
+    };
+    cli.run(cliArgs);
+  }
+
+  /**
+   * Expects a ParseException containing
+   * "cannot be defined for TensorFlow jobs" when --scheduler_docker_image is
+   * passed to a TensorFlow job.
+   */
+  @Test
+  public void testSchedulerDockerImageCannotBeDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for TensorFlow jobs");
+
+    String[] cliArgs = {
+        "--framework", "tensorflow",
+        "--name", "my-job",
+        "--docker_image", "tf-docker:1.1.0",
+        "--input_path", "hdfs://input",
+        "--checkpoint_path", "hdfs://output",
+        "--num_workers", "1",
+        "--worker_launch_cmd", "python run-job.py",
+        "--worker_resources", "memory=4g,vcores=2",
+        "--tensorboard", "true",
+        "--verbose",
+        "--wait_job_finish",
+        "--scheduler_docker_image", "schedulerDockerImage"
+    };
+    cli.run(cliArgs);
+  }
+
+  /**
+   * Expects a ParseException containing
+   * "cannot be defined for TensorFlow jobs" when --scheduler_launch_cmd is
+   * passed to a TensorFlow job.
+   */
+  @Test
+  public void testSchedulerLaunchCommandCannotBeDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+    assertFalse(SubmarineLogs.isVerbose());
+
+    expectedException.expect(ParseException.class);
+    expectedException.expectMessage("cannot be defined for TensorFlow jobs");
+
+    String[] cliArgs = {
+        "--framework", "tensorflow",
+        "--name", "my-job",
+        "--docker_image", "tf-docker:1.1.0",
+        "--input_path", "hdfs://input",
+        "--checkpoint_path", "hdfs://output",
+        "--num_workers", "1",
+        "--worker_launch_cmd", "python run-job.py",
+        "--worker_resources", "memory=4g,vcores=2",
+        "--tensorboard", "true",
+        "--verbose",
+        "--wait_job_finish",
+        "--scheduler_launch_cmd", "schedulerLaunchCommand"
+    };
+    cli.run(cliArgs);
+  }
 }
diff --git 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowYamlTest.java
 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowYamlTest.java
index c4ddd0f..894fc08 100644
--- 
a/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowYamlTest.java
+++ 
b/submarine-client/src/test/java/org/apache/submarine/client/cli/runjob/tensorflow/RunJobCliParsingTensorFlowYamlTest.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.submarine.client.cli.YamlConfigTestUtils;
 import org.apache.submarine.client.cli.param.runjob.RunJobParameters;
 import org.apache.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
+import org.apache.submarine.client.cli.param.yaml.YamlParseException;
 import org.apache.submarine.client.cli.runjob.RunJobCli;
 import org.apache.submarine.commons.runtime.conf.SubmarineLogs;
 import 
org.apache.submarine.commons.runtime.exception.SubmarineRuntimeException;
@@ -292,4 +293,16 @@ public class RunJobCliParsingTensorFlowYamlTest {
     verifyTensorboardValues(jobRunParameters);
   }
 
+  /**
+   * Loads invalid-config-scheduler-section.yaml and expects a
+   * YamlParseException stating that the scheduler section should not be
+   * defined when MXNet is not the selected framework.
+   */
+  @Test
+  public void testInvalidConfigSchedulerSectionIsDefined() throws Exception {
+    RunJobCli cli =
+        new RunJobCli(RunJobCliParsingCommonTest.getMockClientContext());
+
+    exception.expect(YamlParseException.class);
+    exception.expectMessage("Scheduler section should not be defined " +
+        "when MXNet is not the selected framework!");
+
+    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
+        DIR_NAME + "/invalid-config-scheduler-section.yaml");
+    String[] cliArgs = {"-f", yamlConfig.getAbsolutePath(), "--verbose"};
+    cli.run(cliArgs);
+  }
 }
diff --git 
a/submarine-client/src/test/resources/runjob-mxnet-yaml/envs-are-missing.yaml 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/envs-are-missing.yaml
new file mode 100644
index 0000000..51998f8
--- /dev/null
+++ 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/envs-are-missing.yaml
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+  name: testJobName
+  job_type: testJobType
+  framework: mxnet
+
+configs:
+  input_path: testInputPath
+  checkpoint_path: testCheckpointPath
+  saved_model_path: testSavedModelPath
+  docker_image: testDockerImage
+  wait_job_finish: true
+  localizations:
+    - hdfs://remote-file1:/local-filename1:rw
+    - nfs://remote-file2:/local-filename2:rw
+  mounts:
+    - /etc/passwd:/etc/passwd:rw
+    - /etc/hosts:/etc/hosts:rw
+  quicklinks:
+    - Notebook_UI=https://master-0:7070
+    - Notebook_UI2=https://master-0:7071
+
+scheduling:
+  queue: queue1
+
+roles:
+  worker:
+    resources: memory=20480M,vcores=32
+    replicas: 3
+    launch_cmd: testLaunchCmdWorker
+    docker_image: testDockerImageWorker
+  ps:
+    resources: memory=20500M,vcores=34
+    replicas: 4
+    launch_cmd: testLaunchCmdPs
+    docker_image: testDockerImagePs
+  scheduler:
+    resources: memory=10240M,vcores=16
+    replicas: 1
+    launch_cmd: testLaunchCmdScheduler
+    docker_image: testDockerImageScheduler
+
+security:
+  keytab: keytabPath
+  principal: testPrincipal
+  distribute_keytab: true
\ No newline at end of file
diff --git 
a/submarine-client/src/test/resources/runjob-mxnet-yaml/invalid-config-tensorboard-section.yaml
 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/invalid-config-tensorboard-section.yaml
new file mode 100644
index 0000000..747f373
--- /dev/null
+++ 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/invalid-config-tensorboard-section.yaml
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+  name: testJobName
+  job_type: testJobType
+  framework: mxnet
+
+configs:
+  input_path: testInputPath
+  checkpoint_path: testCheckpointPath
+  saved_model_path: testSavedModelPath
+  docker_image: testDockerImage
+  wait_job_finish: true
+  envs:
+    env1: 'env1Value'
+    env2: 'env2Value'
+  localizations:
+    - hdfs://remote-file1:/local-filename1:rw
+    - nfs://remote-file2:/local-filename2:rw
+  mounts:
+    - /etc/passwd:/etc/passwd:rw
+    - /etc/hosts:/etc/hosts:rw
+  quicklinks:
+    - Notebook_UI=https://master-0:7070
+    - Notebook_UI2=https://master-0:7071
+
+scheduling:
+  queue: queue1
+
+roles:
+  worker:
+    resources: memory=20480M,vcores=32
+    replicas: 3
+    launch_cmd: testLaunchCmdWorker
+    docker_image: testDockerImageWorker
+  ps:
+    resources: memory=20500M,vcores=34
+    replicas: 4
+    launch_cmd: testLaunchCmdPs
+    docker_image: testDockerImagePs
+  scheduler:
+    resources: memory=10240M,vcores=16
+    replicas: 1
+    launch_cmd: testLaunchCmdScheduler
+    docker_image: testDockerImageScheduler
+
+security:
+  keytab: keytabPath
+  principal: testPrincipal
+  distribute_keytab: true
+
+tensorBoard:
+  docker_image: tensorboardDockerImage
\ No newline at end of file
diff --git 
a/submarine-client/src/test/resources/runjob-mxnet-yaml/security-principal-is-missing.yaml
 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/security-principal-is-missing.yaml
new file mode 100644
index 0000000..fe2d1f2
--- /dev/null
+++ 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/security-principal-is-missing.yaml
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+  name: testJobName
+  job_type: testJobType
+  framework: mxnet
+
+configs:
+  input_path: testInputPath
+  checkpoint_path: testCheckpointPath
+  saved_model_path: testSavedModelPath
+  docker_image: testDockerImage
+  wait_job_finish: true
+  envs:
+    env1: 'env1Value'
+    env2: 'env2Value'
+  localizations:
+    - hdfs://remote-file1:/local-filename1:rw
+    - nfs://remote-file2:/local-filename2:rw
+  mounts:
+    - /etc/passwd:/etc/passwd:rw
+    - /etc/hosts:/etc/hosts:rw
+  quicklinks:
+    - Notebook_UI=https://master-0:7070
+    - Notebook_UI2=https://master-0:7071
+
+scheduling:
+  queue: queue1
+
+roles:
+  worker:
+    resources: memory=20480M,vcores=32
+    replicas: 3
+    launch_cmd: testLaunchCmdWorker
+    docker_image: testDockerImageWorker
+  ps:
+    resources: memory=20500M,vcores=34
+    replicas: 4
+    launch_cmd: testLaunchCmdPs
+    docker_image: testDockerImagePs
+  scheduler:
+    resources: memory=10240M,vcores=16
+    replicas: 1
+    launch_cmd: testLaunchCmdScheduler
+    docker_image: testDockerImageScheduler
+
+security:
+  keytab: keytabPath
+  distribute_keytab: true
\ No newline at end of file
diff --git 
a/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config-with-overrides.yaml
 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config-with-overrides.yaml
new file mode 100644
index 0000000..345a897
--- /dev/null
+++ 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config-with-overrides.yaml
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+  name: testJobName
+  job_type: testJobType
+  framework: mxnet
+
+configs:
+  input_path: testInputPath
+  checkpoint_path: testCheckpointPath
+  saved_model_path: testSavedModelPath
+  docker_image: testDockerImage
+  wait_job_finish: true
+  envs:
+    env1: 'env1Value'
+    env2: 'env2Value'
+  localizations:
+    - hdfs://remote-file1:/local-filename1:rw
+    - nfs://remote-file2:/local-filename2:rw
+  mounts:
+    - /etc/passwd:/etc/passwd:rw
+    - /etc/hosts:/etc/hosts:rw
+  quicklinks:
+    - Notebook_UI=https://master-0:7070
+    - Notebook_UI2=https://master-0:7071
+
+scheduling:
+  queue: queue1
+
+roles:
+  worker:
+    resources: memory=20480M,vcores=32
+    replicas: 3
+    launch_cmd: overridden_testLaunchCmdWorker
+    docker_image: overridden_testDockerImageWorker
+    envs:
+      env1: 'overridden_env1Worker'
+      env2: 'overridden_env2Worker'
+    localizations:
+      - hdfs://remote-file1:/overridden_local-filename1Worker:rw
+      - nfs://remote-file2:/overridden_local-filename2Worker:rw
+    mounts:
+      - /etc/passwd:/overridden_Worker
+      - /etc/hosts:/overridden_Worker
+  ps:
+    resources: memory=20500M,vcores=34
+    replicas: 4
+    launch_cmd: overridden_testLaunchCmdPs
+    docker_image: overridden_testDockerImagePs
+    envs:
+      env1: 'overridden_env1Ps'
+      env2: 'overridden_env2Ps'
+    localizations:
+      - hdfs://remote-file1:/overridden_local-filename1Ps:rw
+      - nfs://remote-file2:/overridden_local-filename2Ps:rw
+    mounts:
+      - /etc/passwd:/overridden_Ps
+      - /etc/hosts:/overridden_Ps
+  scheduler:
+    resources: memory=10240M,vcores=16
+    replicas: 1
+    launch_cmd: overridden_testLaunchCmdScheduler
+    docker_image: overridden_testDockerImageScheduler
+    envs:
+      env1: 'overridden_env1Scheduler'
+      env2: 'overridden_env2Scheduler'
+    localizations:
+      - hdfs://remote-file1:/overridden_local-filename1Scheduler:rw
+      - nfs://remote-file2:/overridden_local-filename2Scheduler:rw
+    mounts:
+      - /etc/passwd:/overridden_Scheduler
+      - /etc/hosts:/overridden_Scheduler
+
+security:
+  keytab: keytabPath
+  principal: testPrincipal
+  distribute_keytab: true
\ No newline at end of file
diff --git 
a/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config.yaml 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config.yaml
new file mode 100644
index 0000000..23a5dbc
--- /dev/null
+++ b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-config.yaml
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+  name: testJobName
+  job_type: testJobType
+  framework: mxnet
+
+configs:
+  input_path: testInputPath
+  checkpoint_path: testCheckpointPath
+  saved_model_path: testSavedModelPath
+  docker_image: testDockerImage
+  wait_job_finish: true
+  envs:
+    env1: 'env1Value'
+    env2: 'env2Value'
+  localizations:
+    - hdfs://remote-file1:/local-filename1:rw
+    - nfs://remote-file2:/local-filename2:rw
+  mounts:
+    - /etc/passwd:/etc/passwd:rw
+    - /etc/hosts:/etc/hosts:rw
+  quicklinks:
+    - Notebook_UI=https://master-0:7070
+    - Notebook_UI2=https://master-0:7071
+
+scheduling:
+  queue: queue1
+
+roles:
+  worker:
+    resources: memory=20480M,vcores=32
+    replicas: 3
+    launch_cmd: testLaunchCmdWorker
+    docker_image: testDockerImageWorker
+  ps:
+    resources: memory=20500M,vcores=34
+    replicas: 4
+    launch_cmd: testLaunchCmdPs
+    docker_image: testDockerImagePs
+  scheduler:
+    resources: memory=10240M,vcores=16
+    replicas: 1
+    launch_cmd: testLaunchCmdScheduler
+    docker_image: testDockerImageScheduler
+
+security:
+  keytab: keytabPath
+  principal: testPrincipal
+  distribute_keytab: true
\ No newline at end of file
diff --git 
a/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-gpu-config.yaml 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-gpu-config.yaml
new file mode 100644
index 0000000..50a27d4
--- /dev/null
+++ 
b/submarine-client/src/test/resources/runjob-mxnet-yaml/valid-gpu-config.yaml
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+  name: testJobName
+  job_type: testJobType
+  framework: mxnet
+
+configs:
+  input_path: testInputPath
+  checkpoint_path: testCheckpointPath
+  saved_model_path: testSavedModelPath
+  docker_image: testDockerImage
+  wait_job_finish: true
+  envs:
+    env1: 'env1Value'
+    env2: 'env2Value'
+  localizations:
+    - hdfs://remote-file1:/local-filename1:rw
+    - nfs://remote-file2:/local-filename2:rw
+  mounts:
+    - /etc/passwd:/etc/passwd:rw
+    - /etc/hosts:/etc/hosts:rw
+  quicklinks:
+    - Notebook_UI=https://master-0:7070
+    - Notebook_UI2=https://master-0:7071
+
+scheduling:
+  queue: queue1
+
+roles:
+  worker:
+    resources: memory=20480M,vcores=32,gpu=2
+    replicas: 3
+    launch_cmd: testLaunchCmdWorker
+    docker_image: testDockerImageWorker
+  ps:
+    resources: memory=20500M,vcores=34
+    replicas: 4
+    launch_cmd: testLaunchCmdPs
+    docker_image: testDockerImagePs
+  scheduler:
+    resources: memory=10240M,vcores=16
+    replicas: 1
+    launch_cmd: testLaunchCmdScheduler
+    docker_image: testDockerImageScheduler
+
+security:
+  keytab: keytabPath
+  principal: testPrincipal
+  distribute_keytab: true
\ No newline at end of file
diff --git 
a/submarine-client/src/test/resources/runjob-pytorch-yaml/invalid-config-scheduler-section.yaml
 
b/submarine-client/src/test/resources/runjob-pytorch-yaml/invalid-config-scheduler-section.yaml
new file mode 100644
index 0000000..3a8e013
--- /dev/null
+++ 
b/submarine-client/src/test/resources/runjob-pytorch-yaml/invalid-config-scheduler-section.yaml
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+  name: testJobName
+  job_type: testJobType
+  framework: pytorch
+
+configs:
+  input_path: testInputPath
+  checkpoint_path: testCheckpointPath
+  saved_model_path: testSavedModelPath
+  docker_image: testDockerImage
+  wait_job_finish: true
+  envs:
+    env1: 'env1Value'
+    env2: 'env2Value'
+  localizations:
+    - hdfs://remote-file1:/local-filename1:rw
+    - nfs://remote-file2:/local-filename2:rw
+  mounts:
+    - /etc/passwd:/etc/passwd:rw
+    - /etc/hosts:/etc/hosts:rw
+  quicklinks:
+    - Notebook_UI=https://master-0:7070
+    - Notebook_UI2=https://master-0:7071
+
+scheduling:
+  queue: queue1
+
+roles:
+  worker:
+    resources: memory=20480M,vcores=32
+    replicas: 2
+    launch_cmd: testLaunchCmdWorker
+    docker_image: testDockerImageWorker
+  scheduler:
+    docker_image: testDockerImageScheduler
+
+security:
+  keytab: keytabPath
+  principal: testPrincipal
+  distribute_keytab: true
\ No newline at end of file
diff --git 
a/submarine-client/src/test/resources/runjob-tensorflow-yaml/invalid-config-scheduler-section.yaml
 
b/submarine-client/src/test/resources/runjob-tensorflow-yaml/invalid-config-scheduler-section.yaml
new file mode 100644
index 0000000..f3df4dd
--- /dev/null
+++ 
b/submarine-client/src/test/resources/runjob-tensorflow-yaml/invalid-config-scheduler-section.yaml
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+spec:
+  name: testJobName
+  job_type: testJobType
+  framework: tensorflow
+
+configs:
+  input_path: testInputPath
+  checkpoint_path: testCheckpointPath
+  saved_model_path: testSavedModelPath
+  docker_image: testDockerImage
+  wait_job_finish: true
+  envs:
+    env1: 'env1Value'
+    env2: 'env2Value'
+  localizations:
+    - hdfs://remote-file1:/local-filename1:rw
+    - nfs://remote-file2:/local-filename2:rw
+  mounts:
+    - /etc/passwd:/etc/passwd:rw
+    - /etc/hosts:/etc/hosts:rw
+  quicklinks:
+    - Notebook_UI=https://master-0:7070
+    - Notebook_UI2=https://master-0:7071
+
+scheduling:
+  queue: queue1
+
+roles:
+  worker:
+    resources: memory=20480M,vcores=32
+    replicas: 2
+    launch_cmd: testLaunchCmdWorker
+    docker_image: testDockerImageWorker
+  ps:
+    resources: memory=20500M,vcores=34
+    replicas: 2
+    launch_cmd: testLaunchCmdPs
+    docker_image: testDockerImagePs
+  scheduler:
+    docker_image: testDockerImageScheduler
+
+security:
+  keytab: keytabPath
+  principal: testPrincipal
+  distribute_keytab: true
\ No newline at end of file
diff --git 
a/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/Framework.java
 
b/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/Framework.java
index efe974b..1ca1a96 100644
--- 
a/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/Framework.java
+++ 
b/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/Framework.java
@@ -28,7 +28,7 @@ import java.util.stream.Collectors;
  * Represents the type of Machine learning framework to work with.
  */
 public enum Framework {
-  TENSORFLOW(Constants.TENSORFLOW_NAME), PYTORCH(Constants.PYTORCH_NAME);
+  TENSORFLOW(Constants.TENSORFLOW_NAME), PYTORCH(Constants.PYTORCH_NAME), 
MXNET(Constants.MXNET_NAME);
 
   private String value;
 
@@ -58,5 +58,6 @@ public enum Framework {
   private static class Constants {
     static final String TENSORFLOW_NAME = "tensorflow";
     static final String PYTORCH_NAME = "pytorch";
+    static final String MXNET_NAME = "mxnet";
   }
 }
diff --git 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
 
b/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/api/MXNetRole.java
similarity index 66%
copy from 
submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
copy to 
submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/api/MXNetRole.java
index fab6f31..5c635ce 100644
--- 
a/submarine-client/src/main/java/org/apache/submarine/client/cli/param/yaml/Roles.java
+++ 
b/submarine-commons/commons-runtime/src/main/java/org/apache/submarine/commons/runtime/api/MXNetRole.java
@@ -17,28 +17,30 @@
  * under the License.
  */
 
-package org.apache.submarine.client.cli.param.yaml;
+package org.apache.submarine.commons.runtime.api;
 
 /**
- * This class represents a section of the YAML configuration file.
+ * Roles that a task of an MXNet job can take. Every constant carries the
+ * component name that {@link #getComponentName()} returns.
  */
-public class Roles {
-  private Role worker;
-  private Role ps;
+public enum MXNetRole implements Role {
+  PRIMARY_WORKER("master"),
+  WORKER("worker"),
+  PS("ps"),
+  SCHEDULER("scheduler");
 
-  public Role getWorker() {
-    return worker;
-  }
+  /** Component name of this role; assigned once in the constructor. */
+  private final String compName;
 
-  public void setWorker(Role worker) {
-    this.worker = worker;
+  MXNetRole(String compName) {
+    this.compName = compName;
   }
 
-  public Role getPs() {
-    return ps;
+  /** Returns the component name passed to this constant's constructor. */
+  @Override
+  public String getComponentName() {
+    return compName;
   }
 
-  public void setPs(Role ps) {
-    this.ps = ps;
+  /** Returns the identifier of the enum constant, as given by name(). */
+  @Override
+  public String getName() {
+    return name();
   }
 }
diff --git 
a/submarine-server/server-submitter/submitter-yarn/src/main/java/org/apache/submarine/server/submitter/yarn/YarnUtils.java
 
b/submarine-server/server-submitter/submitter-yarn/src/main/java/org/apache/submarine/server/submitter/yarn/YarnUtils.java
index 07a26ed..1b11d5c 100644
--- 
a/submarine-server/server-submitter/submitter-yarn/src/main/java/org/apache/submarine/server/submitter/yarn/YarnUtils.java
+++ 
b/submarine-server/server-submitter/submitter-yarn/src/main/java/org/apache/submarine/server/submitter/yarn/YarnUtils.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.submarine.client.cli.CliConstants;
 import org.apache.submarine.client.cli.param.Localization;
 import org.apache.submarine.client.cli.param.ParametersHolder;
+import org.apache.submarine.commons.runtime.Framework;
 import org.apache.submarine.commons.runtime.param.Parameter;
 import org.apache.submarine.commons.runtime.resource.ResourceUtils;
 
@@ -54,67 +55,24 @@ public final class YarnUtils {
             parameters.getFramework().getValue());
     tonyConf.setStrings(TonyConfigurationKeys.APPLICATION_NAME,
             parameters.getParameters().getName());
-    tonyConf.setStrings(
-        TonyConfigurationKeys.getInstancesKey(Constants.WORKER_JOB_NAME),
-            parameters.getOptionValue(CliConstants.N_WORKERS));
-    if (parameters.getOptionValue(CliConstants.N_PS) != null) {
-      tonyConf.setStrings(
-              TonyConfigurationKeys.getInstancesKey(Constants.PS_JOB_NAME),
-              parameters.getOptionValue(CliConstants.N_PS));
-    }
-    // Resources for PS & Worker
-    if (parameters.getOptionValue(CliConstants.PS_RES) != null) {
-      Resource psResource = getResource(parameters, CliConstants.PS_RES);
 
-      tonyConf.setInt(
-          TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME,
-              Constants.VCORES),
-              psResource.getVirtualCores());
-      tonyConf.setLong(
-          TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME,
-              Constants.MEMORY),
-          ResourceUtils.getMemorySize(psResource));
+    setParametersForWorker(tonyConf, parameters);
+    setParametersForPS(tonyConf, parameters);
+    if (parameters.getFramework() == Framework.MXNET) {
+      setParametersForScheduler(tonyConf, parameters);
     }
-    if (parameters.getOptionValue(CliConstants.WORKER_RES) != null) {
-      Resource workerResource = getResource(parameters, 
CliConstants.WORKER_RES);
 
-      tonyConf.setInt(
-          TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
-              Constants.VCORES),
-              workerResource.getVirtualCores());
-      tonyConf.setLong(
-          TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
-              Constants.MEMORY),
-          ResourceUtils.getMemorySize(workerResource));
-      tonyConf.setLong(
-          TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
-              Constants.GPUS),
-          ResourceUtils.getResourceValue(workerResource,
-              ResourceUtils.GPU_URI));
-    }
     if (parameters.getOptionValue(CliConstants.QUEUE) != null) {
       tonyConf.set(
           TonyConfigurationKeys.YARN_QUEUE_NAME,
               parameters.getOptionValue(CliConstants.QUEUE));
     }
-    // Set up Docker for PS & Worker
+
     if (parameters.getOptionValue(CliConstants.DOCKER_IMAGE) != null) {
       tonyConf.set(TonyConfigurationKeys.getContainerDockerKey(),
               parameters.getOptionValue(CliConstants.DOCKER_IMAGE));
       tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true);
     }
-    if (parameters.getOptionValue(CliConstants.WORKER_DOCKER_IMAGE) != null) {
-      tonyConf.set(
-          TonyConfigurationKeys.getDockerImageKey(Constants.WORKER_JOB_NAME),
-              parameters.getOptionValue(CliConstants.WORKER_DOCKER_IMAGE));
-      tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true);
-    }
-    if (parameters.getOptionValue(CliConstants.PS_DOCKER_IMAGE) != null) {
-      tonyConf.set(
-          TonyConfigurationKeys.getDockerImageKey(Constants.PS_JOB_NAME),
-              parameters.getOptionValue(CliConstants.PS_DOCKER_IMAGE));
-      tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true);
-    }
 
     // Set up container environment
     if (parameters.getOptionValues(CliConstants.ENV) != null) {
@@ -132,18 +90,6 @@ public final class YarnUtils {
     }
     // Update after SUBMARINE-104 is merged into tony.
     // tonyConf.setStrings(TonyConfigurationKeys.APPLICATION_TYPE, 
SUBMARINE_RUNTIME_APP_TYPE);
-    // Set up running command
-    if (parameters.getOptionValue(CliConstants.WORKER_LAUNCH_CMD) != null) {
-      tonyConf.set(
-          
TonyConfigurationKeys.getExecuteCommandKey(Constants.WORKER_JOB_NAME),
-              parameters.getOptionValue(CliConstants.WORKER_LAUNCH_CMD));
-    }
-
-    if (parameters.getOptionValue(CliConstants.PS_LAUNCH_CMD) != null) {
-      tonyConf.set(
-          TonyConfigurationKeys.getExecuteCommandKey(Constants.PS_JOB_NAME),
-              parameters.getOptionValue(CliConstants.PS_LAUNCH_CMD));
-    }
 
     tonyConf.setBoolean(TonyConfigurationKeys.SECURITY_ENABLED,
         !parameters.hasOption(CliConstants.INSECURE_CLUSTER));
@@ -203,4 +149,113 @@ public final class YarnUtils {
     }
     return ResourceUtils.createResourceFromString(resourceStr);
   }
+
+  private static void setParametersForWorker (Configuration tonyConf,
+          ParametersHolder parameters) throws YarnException, ParseException {
+    tonyConf.setStrings(
+            TonyConfigurationKeys.getInstancesKey(Constants.WORKER_JOB_NAME),
+            parameters.getOptionValue(CliConstants.N_WORKERS));
+
+    if (parameters.getOptionValue(CliConstants.WORKER_RES) != null) {
+      Resource workerResource = getResource(parameters, 
CliConstants.WORKER_RES);
+
+      tonyConf.setInt(
+              TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
+                      Constants.VCORES),
+              workerResource.getVirtualCores());
+      tonyConf.setLong(
+              TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
+                      Constants.MEMORY),
+              ResourceUtils.getMemorySize(workerResource));
+      tonyConf.setLong(
+              TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
+                      Constants.GPUS),
+              ResourceUtils.getResourceValue(workerResource,
+                      ResourceUtils.GPU_URI));
+    }
+
+    if (parameters.getOptionValue(CliConstants.WORKER_DOCKER_IMAGE) != null) {
+      tonyConf.set(
+              
TonyConfigurationKeys.getDockerImageKey(Constants.WORKER_JOB_NAME),
+              parameters.getOptionValue(CliConstants.WORKER_DOCKER_IMAGE));
+      tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true);
+    }
+
+    if (parameters.getOptionValue(CliConstants.WORKER_LAUNCH_CMD) != null) {
+      tonyConf.set(
+              
TonyConfigurationKeys.getExecuteCommandKey(Constants.WORKER_JOB_NAME),
+              parameters.getOptionValue(CliConstants.WORKER_LAUNCH_CMD));
+    }
+  }
+
+  private static void setParametersForPS (Configuration tonyConf,
+          ParametersHolder parameters) throws YarnException, ParseException {
+    String jobName = Constants.PS_JOB_NAME;
+    if (parameters.getFramework() == Framework.MXNET) {
+      jobName = Constants.SERVER_JOB_NAME;
+    }
+
+    if (parameters.getOptionValue(CliConstants.N_PS) != null) {
+      tonyConf.setStrings(
+              TonyConfigurationKeys.getInstancesKey(jobName),
+              parameters.getOptionValue(CliConstants.N_PS));
+    }
+    if (parameters.getOptionValue(CliConstants.PS_RES) != null) {
+      Resource psResource = getResource(parameters, CliConstants.PS_RES);
+
+      tonyConf.setInt(
+              TonyConfigurationKeys.getResourceKey(jobName,
+                      Constants.VCORES),
+              psResource.getVirtualCores());
+      tonyConf.setLong(
+              TonyConfigurationKeys.getResourceKey(jobName,
+                      Constants.MEMORY),
+              ResourceUtils.getMemorySize(psResource));
+    }
+
+    if (parameters.getOptionValue(CliConstants.PS_LAUNCH_CMD) != null) {
+      tonyConf.set(
+              TonyConfigurationKeys.getExecuteCommandKey(jobName),
+              parameters.getOptionValue(CliConstants.PS_LAUNCH_CMD));
+    }
+
+    if (parameters.getOptionValue(CliConstants.PS_DOCKER_IMAGE) != null) {
+      tonyConf.set(
+              TonyConfigurationKeys.getDockerImageKey(jobName),
+              parameters.getOptionValue(CliConstants.PS_DOCKER_IMAGE));
+      tonyConf.setBoolean(TonyConfigurationKeys.DOCKER_ENABLED, true);
+    }
+  }
+
+  private static void setParametersForScheduler (Configuration tonyConf,
+          ParametersHolder parameters) throws YarnException, ParseException {
+    if (parameters.getOptionValue(CliConstants.N_SCHEDULERS) != null) {
+      tonyConf.setStrings(
+              
TonyConfigurationKeys.getInstancesKey(Constants.SCHEDULER_JOB_NAME),
+              parameters.getOptionValue(CliConstants.N_SCHEDULERS));
+    }
+
+    if (parameters.getOptionValue(CliConstants.SCHEDULER_RES) != null) {
+      Resource schedulerResource = getResource(parameters, 
CliConstants.SCHEDULER_RES);
+
+      tonyConf.setInt(
+              
TonyConfigurationKeys.getResourceKey(Constants.SCHEDULER_JOB_NAME,
+                      Constants.VCORES),
+              schedulerResource.getVirtualCores());
+      tonyConf.setLong(
+              
TonyConfigurationKeys.getResourceKey(Constants.SCHEDULER_JOB_NAME,
+                      Constants.MEMORY),
+              ResourceUtils.getMemorySize(schedulerResource));
+    }
+    if (parameters.getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD) != null) {
+      tonyConf.set(
+              
TonyConfigurationKeys.getExecuteCommandKey(Constants.SCHEDULER_JOB_NAME),
+              parameters.getOptionValue(CliConstants.SCHEDULER_LAUNCH_CMD));
+    }
+    if (parameters.getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE) != 
null) {
+      tonyConf.set(
+              
TonyConfigurationKeys.getDockerImageKey(Constants.SCHEDULER_JOB_NAME),
+              parameters.getOptionValue(CliConstants.SCHEDULER_DOCKER_IMAGE));
+    }
+  }
 }
diff --git 
a/submarine-server/server-submitter/submitter-yarn/src/test/java/YarnUtilsTest.java
 
b/submarine-server/server-submitter/submitter-yarn/src/test/java/YarnUtilsTest.java
index e919e41..f360abe 100644
--- 
a/submarine-server/server-submitter/submitter-yarn/src/test/java/YarnUtilsTest.java
+++ 
b/submarine-server/server-submitter/submitter-yarn/src/test/java/YarnUtilsTest.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.submarine.client.cli.param.ParametersHolder;
+import org.apache.submarine.client.cli.param.runjob.MXNetRunJobParameters;
 import org.apache.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
 import org.apache.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
 import org.apache.submarine.client.cli.runjob.RunJobCli;
@@ -159,4 +160,62 @@ public class YarnUtilsTest {
     // Update after SUBMARINE-104 is merged into tony.
     // Assert.assertEquals("SUBMARINE", 
tonyConf.get(TonyConfigurationKeys.APPLICATION_TYPE));
   }
+
+  @Test
+  public void testMXNetTonyConfFromClientContext() throws Exception { // Runs an MXNet job through RunJobCli and checks the TonY configuration built from its parameters.
+    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
+    runJobCli.run( // the command line configures the worker, ps and scheduler roles
+        new String[] {"--framework", "mxnet", "--name", "my-job",
+            "--docker_image", "mxnet-docker:1.1.0",
+            "--input_path", "hdfs://input",
+            "--num_workers", "2", "--num_ps", "2", "--worker_launch_cmd",
+            "python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
+            "--ps_resources", "memory=4G,vcores=4", "--ps_launch_cmd",
+            "python run-ps.py", "--num_schedulers", "1", 
"--scheduler_launch_cmd",
+            "python run-scheduler.py", "--scheduler_resources", 
"memory=2048M,vcores=2"});
+    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
+    ParametersHolder parametersHolder = runJobCli.getParametersHolder();
+
+    assertTrue(RunJobParameters.class + " must be an instance of " + // the cast below relies on this check
+                    MXNetRunJobParameters.class,
+            jobRunParameters instanceof MXNetRunJobParameters);
+    MXNetRunJobParameters mxNetParams = (MXNetRunJobParameters) 
jobRunParameters;
+
+    Configuration tonyConf = YarnUtils // configuration under test
+            .tonyConfFromClientContext(parametersHolder);
+    Assert.assertEquals(parametersHolder.getFramework().getValue(), // job-level settings: framework, name, Docker image
+            tonyConf.get(TonyConfigurationKeys.FRAMEWORK_NAME));
+    Assert.assertEquals(jobRunParameters.getName(),
+            tonyConf.get(TonyConfigurationKeys.APPLICATION_NAME));
+    Assert.assertEquals(jobRunParameters.getDockerImageName(),
+            tonyConf.get(TonyConfigurationKeys.getContainerDockerKey()));
+
+    Assert.assertEquals("2", tonyConf.get(TonyConfigurationKeys // worker role: instances, command, memory, vcores
+            .getInstancesKey("worker")));
+    Assert.assertEquals(mxNetParams.getWorkerLaunchCmd(),
+            tonyConf.get(TonyConfigurationKeys
+                    .getExecuteCommandKey("worker")));
+    Assert.assertEquals("2048", tonyConf.get(TonyConfigurationKeys // --worker_resources memory=2048M is expected to read back as "2048"
+            .getResourceKey(Constants.WORKER_JOB_NAME, Constants.MEMORY)));
+    Assert.assertEquals("2", tonyConf.get(TonyConfigurationKeys
+            .getResourceKey(Constants.WORKER_JOB_NAME, Constants.VCORES)));
+    Assert.assertEquals("2", // parameter-server role, looked up under Constants.SERVER_JOB_NAME
+            
tonyConf.get(TonyConfigurationKeys.getInstancesKey(Constants.SERVER_JOB_NAME)));
+    Assert.assertEquals("4096", tonyConf.get(TonyConfigurationKeys // --ps_resources memory=4G is expected to read back as "4096"
+            .getResourceKey(Constants.SERVER_JOB_NAME, Constants.MEMORY)));
+    Assert.assertEquals("4", tonyConf.get(TonyConfigurationKeys
+            .getResourceKey(Constants.SERVER_JOB_NAME, Constants.VCORES)));
+    Assert.assertEquals(mxNetParams.getPSLaunchCmd(),
+            
tonyConf.get(TonyConfigurationKeys.getExecuteCommandKey(Constants.SERVER_JOB_NAME)));
+    Assert.assertEquals("1", // scheduler role: instances, memory, vcores, command
+            
tonyConf.get(TonyConfigurationKeys.getInstancesKey(Constants.SCHEDULER_JOB_NAME)));
+    Assert.assertEquals("2048", tonyConf.get(TonyConfigurationKeys
+            .getResourceKey(Constants.SCHEDULER_JOB_NAME, Constants.MEMORY)));
+    Assert.assertEquals("2", tonyConf.get(TonyConfigurationKeys
+            .getResourceKey(Constants.SCHEDULER_JOB_NAME, Constants.VCORES)));
+    Assert.assertEquals(mxNetParams.getSchedulerLaunchCmd(),
+            
tonyConf.get(TonyConfigurationKeys.getExecuteCommandKey(Constants.SCHEDULER_JOB_NAME)));
+    // Update after SUBMARINE-104 is merged into tony.
+    //Assert.assertEquals("SUBMARINE", 
tonyConf.get(TonyConfigurationKeys.APPLICATION_TYPE));
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to