This is an automated email from the ASF dual-hosted git repository.

ztang pushed a commit to annotated tag release-0.3.0-RC0
in repository https://gitbox.apache.org/repos/asf/submarine.git

commit 6c51bab81f033ef5e42d2017ec1bf452d08a5acf
Author: Wanqiang Ji <[email protected]>
AuthorDate: Sun Jan 19 12:34:33 2020 +0800

    SUBMARINE-347. Fix the job spec parser issue and refine the TF job on K8s 
document
    
    ### What is this PR for?
    Fix the issue in parsing a JobSpec into a TFJob.
    Refine the Submarine server documentation.
    
    ### What type of PR is it?
    [Bug Fix | Documentation]
    
    ### Todos
    
    ### What is the Jira issue?
    https://issues.apache.org/jira/browse/SUBMARINE-347
    
    ### How should this be tested?
    https://travis-ci.com/jiwq/submarine/builds/144941387
    
    ### Screenshots (if appropriate)
    
    ### Questions:
    * Do the license files need an update? No
    * Are there breaking changes for older versions? No
    * Does this need documentation? No
    
    Author: Wanqiang Ji <[email protected]>
    
    Closes #153 from jiwq/SUBMARINE-347 and squashes the following commits:
    
    f7c2ccc [Wanqiang Ji] SUBMARINE-347. Fix the job spec parser issue and 
refine the TF job on K8s document
    
    (cherry picked from commit 3ea8a9f1203f553c334751a646b26eeb9ae48c48)
---
 docs/submarine-server/README.md                    | 29 +++++++++++++++++++++-
 docs/submarine-server/ml-frameworks/tensorflow.md  |  2 ++
 .../server/submitter/k8s/parser/JobSpecParser.java |  3 ++-
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/docs/submarine-server/README.md b/docs/submarine-server/README.md
index 30236ed..ed1a77f 100644
--- a/docs/submarine-server/README.md
+++ b/docs/submarine-server/README.md
@@ -91,9 +91,36 @@ or
 For more info see [here](../design/submarine-server/jobspec.md).
 
 ### Submit Job
+> Before submitting a training job, make sure you have deployed the 
[submarine server and tf-operator](./setup-kubernetes.md#setup-submarine).
+
 You can use the Postman post the job to server or use `curl` run following 
command:
 ```
 curl -H "Content-Type: application/json" --request POST \
---data 
`{"name":"mnist","librarySpec":{"name":"TensorFlow","version":"2.1.0","image":"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0","cmd":"python
 /var/tf_mnist/mnist_with_summaries.py --log_dir=/train/log 
--learning_rate=0.01 
--batch_size=150","envVars":{"ENV_1":"ENV1"}},"submitterSpec":{"type":"k8s","configPath":null,"namespace":"submarine","kind":"TFJob","apiVersion":"kubeflow.org/v1"},"taskSpecs":{"Ps":{"name":"tensorflow","replicas":2,"resources":"cpu=4,memory=2048M,nvidia.com/gpu=
 [...]
+--data 
'{"name":"mnist","librarySpec":{"name":"TensorFlow","version":"2.1.0","image":"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0","cmd":"python
 /var/tf_mnist/mnist_with_summaries.py --log_dir=/train/log 
--learning_rate=0.01 
--batch_size=150","envVars":{"ENV_1":"ENV1"}},"submitterSpec":{"type":"k8s","configPath":null,"namespace":"submarine","kind":"TFJob","apiVersion":"kubeflow.org/v1"},"taskSpecs":{"Ps":{"name":"tensorflow","replicas":1,"resources":"cpu=1,memory=1024M"},"Worker":{"na
 [...]
 http://127.0.0.1:8080/api/v1/jobs
 ```
+
+### Verify Jobs
+You can run the following command to get the submitted job:
+```
+kubectl get -n submarine tfjob
+```
+
+**Output:**
+```
+NAME    STATE     AGE
+mnist   Created   7m6s
+```
+
+You can also find the pods running the jobs with the following command:
+```
+kubectl get -n submarine pods
+```
+
+**Output:**
+```
+NAME                               READY   STATUS              RESTARTS   AGE
+mnist-ps-0                         0/1     ContainerCreating   0          3m47s
+mnist-worker-0                     0/1     Pending             0          3m47s
+tf-job-operator-74cc6bd6cb-fqd5s   1/1     Running             0          98m
+```
diff --git a/docs/submarine-server/ml-frameworks/tensorflow.md 
b/docs/submarine-server/ml-frameworks/tensorflow.md
index c03a703..30c4ef3 100644
--- a/docs/submarine-server/ml-frameworks/tensorflow.md
+++ b/docs/submarine-server/ml-frameworks/tensorflow.md
@@ -23,6 +23,8 @@ under the License.
 We support Tensorflow job on kubernetes by using the tf-operator as a runtime. 
For more info about tf-operator see 
[here](https://github.com/kubeflow/tf-operator).
 
 ### Deploy tf-operator
+> If you don't have the `submarine` namespace on your K8s cluster, you should 
create it first by running: `kubectl create namespace submarine`
+
 Running the follow commands:
 ```
 kubectl apply -f ./dev-support/k8s/tfjob/crd.yaml
diff --git 
a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/parser/JobSpecParser.java
 
b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/parser/JobSpecParser.java
index 2b4f633..13c1d4a 100644
--- 
a/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/parser/JobSpecParser.java
+++ 
b/submarine-server/server-submitter/submitter-k8s/src/main/java/org/apache/submarine/server/submitter/k8s/parser/JobSpecParser.java
@@ -67,7 +67,7 @@ public class JobSpecParser {
       TFReplicaSpec spec = new TFReplicaSpec();
       spec.setReplicas(entry.getValue().getReplicas());
       spec.setTemplate(parseTemplateSpec(entry.getValue(), 
jobSpec.getLibrarySpec()));
-      replicaSpecMap.put(entry.getValue().getName(), spec);
+      replicaSpecMap.put(entry.getKey(), spec);
     }
     tfJobSpec.setTfReplicaSpecs(replicaSpecMap);
     return tfJobSpec;
@@ -96,6 +96,7 @@ public class JobSpecParser {
     resources.setLimits(parseResources(taskSpec));
     container.setResources(resources);
     container.setEnv(parseEnvVars(taskSpec, libSpec.getEnvVars()));
+    containers.add(container);
     podSpec.setContainers(containers);
     templateSpec.setSpec(podSpec);
     return templateSpec;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to