This is an automated email from the ASF dual-hosted git repository. ztang pushed a commit to annotated tag release-0.3.0-RC0 in repository https://gitbox.apache.org/repos/asf/submarine.git
commit 6c51bab81f033ef5e42d2017ec1bf452d08a5acf Author: Wanqiang Ji <[email protected]> AuthorDate: Sun Jan 19 12:34:33 2020 +0800 SUBMARINE-347. Fix the job spec parser issue and refine the TF job on K8s document ### What is this PR for? Fix the issue in parsing a JobSpec into a TFJob. Refine the documentation of the submarine server. ### What type of PR is it? [Bug Fix | Documentation] ### Todos ### What is the Jira issue? https://issues.apache.org/jira/browse/SUBMARINE-347 ### How should this be tested? https://travis-ci.com/jiwq/submarine/builds/144941387 ### Screenshots (if appropriate) ### Questions: * Do the license files need an update? No * Are there breaking changes for older versions? No * Does this need documentation? No Author: Wanqiang Ji <[email protected]> Closes #153 from jiwq/SUBMARINE-347 and squashes the following commits: f7c2ccc [Wanqiang Ji] SUBMARINE-347. Fix the job spec parser issue and refine the TF job on K8s document (cherry picked from commit 3ea8a9f1203f553c334751a646b26eeb9ae48c48) --- docs/submarine-server/README.md | 29 +++++++++++++++++++++- docs/submarine-server/ml-frameworks/tensorflow.md | 2 ++ .../server/submitter/k8s/parser/JobSpecParser.java | 3 ++- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/docs/submarine-server/README.md b/docs/submarine-server/README.md index 30236ed..ed1a77f 100644 --- a/docs/submarine-server/README.md +++ b/docs/submarine-server/README.md @@ -91,9 +91,36 @@ or For more info see [here](../design/submarine-server/jobspec.md). ### Submit Job +> Before submitting a training job, make sure you have deployed the [submarine server and tf-operator](./setup-kubernetes.md#setup-submarine). 
+ You can use Postman to post the job to the server, or use `curl` to run the following command: ``` curl -H "Content-Type: application/json" --request POST \ ---data `{"name":"mnist","librarySpec":{"name":"TensorFlow","version":"2.1.0","image":"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0","cmd":"python /var/tf_mnist/mnist_with_summaries.py --log_dir=/train/log --learning_rate=0.01 --batch_size=150","envVars":{"ENV_1":"ENV1"}},"submitterSpec":{"type":"k8s","configPath":null,"namespace":"submarine","kind":"TFJob","apiVersion":"kubeflow.org/v1"},"taskSpecs":{"Ps":{"name":"tensorflow","replicas":2,"resources":"cpu=4,memory=2048M,nvidia.com/gpu= [...] +--data '{"name":"mnist","librarySpec":{"name":"TensorFlow","version":"2.1.0","image":"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0","cmd":"python /var/tf_mnist/mnist_with_summaries.py --log_dir=/train/log --learning_rate=0.01 --batch_size=150","envVars":{"ENV_1":"ENV1"}},"submitterSpec":{"type":"k8s","configPath":null,"namespace":"submarine","kind":"TFJob","apiVersion":"kubeflow.org/v1"},"taskSpecs":{"Ps":{"name":"tensorflow","replicas":1,"resources":"cpu=1,memory=1024M"},"Worker":{"na [...] http://127.0.0.1:8080/api/v1/jobs ``` + +### Verify Jobs +You can run the following command to get the submitted job: +``` +kubectl get -n submarine tfjob +``` + +**Output:** +``` +NAME STATE AGE +mnist Created 7m6s +``` + +You can also find the pods that are running the jobs by running the following command: +``` +kubectl get -n submarine pods +``` + +**Output:** +``` +NAME READY STATUS RESTARTS AGE +mnist-ps-0 0/1 ContainerCreating 0 3m47s +mnist-worker-0 0/1 Pending 0 3m47s +tf-job-operator-74cc6bd6cb-fqd5s 1/1 Running 0 98m +``` diff --git a/docs/submarine-server/ml-frameworks/tensorflow.md b/docs/submarine-server/ml-frameworks/tensorflow.md index c03a703..30c4ef3 100644 --- a/docs/submarine-server/ml-frameworks/tensorflow.md +++ b/docs/submarine-server/ml-frameworks/tensorflow.md @@ -23,6 +23,8 @@ under the License. 
We support TensorFlow jobs on Kubernetes by using the tf-operator as a runtime. For more info about tf-operator see [here](https://github.com/kubeflow/tf-operator). ### Deploy tf-operator +> If you don't have the `submarine` namespace on your K8s cluster, you should create it first. Run the command: `kubectl create namespace submarine` + Run the following commands: ``` kubectl apply -f ./dev-support/k8s/tfjob/crd.yaml diff --git a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/parser/JobSpecParser.java b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/parser/JobSpecParser.java index 2b4f633..13c1d4a 100644 --- a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/parser/JobSpecParser.java +++ b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/parser/JobSpecParser.java @@ -67,7 +67,7 @@ public class JobSpecParser { TFReplicaSpec spec = new TFReplicaSpec(); spec.setReplicas(entry.getValue().getReplicas()); spec.setTemplate(parseTemplateSpec(entry.getValue(), jobSpec.getLibrarySpec())); - replicaSpecMap.put(entry.getValue().getName(), spec); + replicaSpecMap.put(entry.getKey(), spec); } tfJobSpec.setTfReplicaSpecs(replicaSpecMap); return tfJobSpec; @@ -96,6 +96,7 @@ public class JobSpecParser { resources.setLimits(parseResources(taskSpec)); container.setResources(resources); container.setEnv(parseEnvVars(taskSpec, libSpec.getEnvVars())); + containers.add(container); podSpec.setContainers(containers); templateSpec.setSpec(podSpec); return templateSpec; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
