Repository: incubator-singa
Updated Branches:
  refs/heads/master f234c4972 -> 7d9c0fb4b


SINGA-26 Run distributed training in a single command

To run SINGA in distributed mode with a single command, use:

    $ ./bin/singa-run.sh -conf=YOUR_CONF_DIR

The provided directory must contain three files:
  * model.conf
  * cluster.conf
  * hostfile

Please note that $zookeeper_host in cluster.conf must be set to the host that
executes this script.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/903e0362
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/903e0362
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/903e0362

Branch: refs/heads/master
Commit: 903e0362fcff1edfafa89f1dd78917d39a3af0d3
Parents: f234c49
Author: wang sheng <[email protected]>
Authored: Sun Jun 28 03:00:56 2015 +0800
Committer: wang sheng <[email protected]>
Committed: Sun Jun 28 03:00:56 2015 +0800

----------------------------------------------------------------------
 bin/singa-cleanup.sh   | 43 ------------------------
 bin/singa-run.sh       | 80 ++++++++++++++++++++++++++++++++++++---------
 bin/singa-stop.sh      | 67 +++++++++++++++++++++++++++++++++++++
 bin/zk-service.sh      | 28 +++++++---------
 src/trainer/trainer.cc |  2 +-
 5 files changed, 145 insertions(+), 75 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/bin/singa-cleanup.sh
----------------------------------------------------------------------
diff --git a/bin/singa-cleanup.sh b/bin/singa-cleanup.sh
deleted file mode 100755
index f94c9db..0000000
--- a/bin/singa-cleanup.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env bash
-#
-#/**
-# * Copyright 2015 The Apache Software Foundation
-# *
-# * Licensed to the Apache Software Foundation (ASF) under one
-# * or more contributor license agreements.  See the NOTICE file
-# * distributed with this work for additional information
-# * regarding copyright ownership.  The ASF licenses this file
-# * to you under the Apache License, Version 2.0 (the
-# * "License"); you may not use this file except in compliance
-# * with the License.  You may obtain a copy of the License at
-# *
-# *     http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-# 
-# Manage a zookeeper service
-#
-
-usage="Usage: singa-cleanup.sh"
-
-#if [ $# -le 0 ]; then
-#  echo $usage
-#  exit 1
-#fi
-
-BIN=`dirname "${BASH_SOURCE-$0}"`
-BIN=`cd "$BIN">/dev/null; pwd`
-BASE=`cd "$BIN/..">/dev/null; pwd`
-ZKDATADIR="/tmp/zookeeper"
-
-. $BIN/zk-service.sh stop 2>/dev/null
-
-echo cleanning data in zookeeper...
-#remove zk data
-rm -r $ZKDATADIR
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/bin/singa-run.sh
----------------------------------------------------------------------
diff --git a/bin/singa-run.sh b/bin/singa-run.sh
index c911ec3..b69fe5d 100755
--- a/bin/singa-run.sh
+++ b/bin/singa-run.sh
@@ -23,34 +23,84 @@
 # Run a Singa job
 #
 
-usage="Usage: singa-run.sh"
+usage="Usage: \n \
+  (single node): singa-run.sh -cluster=YOUR_CONF_FILE -model=YOUR_CONF_FILE \n 
\
+  (distributed): singa-run.sh -conf=YOUR_CONF_DIR \ 
+  (the directory should contain cluster.conf/model.conf/hostfile)"
 
-#if [ $# -le 0 ]; then
-#  echo $usage
+#if [ $# -le 0 ] || [ $# -ge 3 ] ; then
+#  echo -e $usage
 #  exit 1
 #fi
 
+valid_args=false
+
+if [ $# = 1 ] ; then
+  if [[ $1 = "-conf="* ]] ; then
+    valid_args=true
+    conf_path=${1:6}
+    host_path=$conf_path/hostfile
+  fi
+elif [ $# = 2 ] ; then
+  if [[ $1 = "-cluster="* ]] && [[ $2 = "-model="*  ]] ; then
+    valid_args=true
+  elif [[ $2 = "-cluster="* ]] && [[ $1 = "-model="*  ]] ; then
+    valid_args=true
+  fi
+fi
+
+if [ $valid_args = false ] ; then
+  echo -e $usage
+  exit 1 
+fi
+
+# get singa-base
 BIN=`dirname "${BASH_SOURCE-$0}"`
 BIN=`cd "$BIN">/dev/null; pwd`
 BASE=`cd "$BIN/..">/dev/null; pwd`
 
 cd $BASE
 
-#cleanup singa data
-. $BIN/singa-cleanup.sh
+# clenup singa data
+if [ -z $host_path ] ; then
+  . $BIN/singa-stop.sh
+else
+  . $BIN/singa-stop.sh $host_path
+fi
 
-#start zookeeper
+# start zookeeper
 . $BIN/zk-service.sh start 2>/dev/null
 
-#wait for zk service to be up
+# wait for zk service to be up
 sleep 3
 
-#run singa
-cmd="./singa "$@
-echo starting singa ...
-echo executing: $cmd
-exec $cmd
+# check mode
+if [ $# = 2 ] ; then
+  # start singa process
+  cmd="./singa "$@
+  echo starting singa ...
+  echo executing : $cmd
+  exec $cmd
+elif [ $# = 1 ] ; then
+  # ssh and start singa processes
+  ssh_options="-oStrictHostKeyChecking=no \
+  -oUserKnownHostsFile=/dev/null \
+  -oLogLevel=quiet"
+  hosts=(`cat $host_path |cut -d ' ' -f 1`)
+  for i in ${hosts[@]} ; do
+    cmd="cd $BASE; \
+        ./singa \
+        -cluster=$conf_path/cluster.conf \
+        -model=$conf_path/model.conf"
+    echo executing @ $i : $cmd
+    ssh $ssh_options $i $cmd &
+  done
+  wait
+fi
 
-#stop zookeeper
-echo stopping singa ...
-. $BIN/zk-service.sh stop 2>/dev/null
+# cleanup singa data
+if [ -z $host_path ] ; then
+  . $BIN/singa-stop.sh
+else
+  . $BIN/singa-stop.sh $host_path
+fi

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/bin/singa-stop.sh
----------------------------------------------------------------------
diff --git a/bin/singa-stop.sh b/bin/singa-stop.sh
new file mode 100755
index 0000000..acded75
--- /dev/null
+++ b/bin/singa-stop.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+#
+#/**
+# * Copyright 2015 The Apache Software Foundation
+# *
+# * Licensed to the Apache Software Foundation (ASF) under one
+# * or more contributor license agreements.  See the NOTICE file
+# * distributed with this work for additional information
+# * regarding copyright ownership.  The ASF licenses this file
+# * to you under the Apache License, Version 2.0 (the
+# * "License"); you may not use this file except in compliance
+# * with the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# 
+# Clean up singa processes and zookeeper metadata
+#
+
+usage="Usage: \n \
+      (local process): singa-stop.sh \n \
+      (distributed): singa-stop.sh HOST_FILE"
+
+if [ $# -gt 1 ]; then
+  echo -e $usage
+  exit 1
+fi
+
+BIN=`dirname "${BASH_SOURCE-$0}"`
+BIN=`cd "$BIN">/dev/null; pwd`
+BASE=`cd "$BIN/..">/dev/null; pwd`
+ZKDATA_DIR="/tmp/zookeeper"
+
+PROC_NAME="lt-singa"
+HOST_FILE=$1
+
+
+# kill singa processes
+if [ $# = 0 ] ; then
+  echo kill singa @ localhost ...
+  cmd="killall -s SIGKILL "$PROC_NAME
+  exec $cmd
+elif [ $# = 1 ] ; then
+  ssh_options="-oStrictHostKeyChecking=no \
+  -oUserKnownHostsFile=/dev/null \
+  -oLogLevel=quiet"
+  hosts=(`cat $HOST_FILE |cut -d ' ' -f 1`)
+  for i in ${hosts[@]} ; do
+    cmd="killall -s SIGKILL "$PROC_NAME
+    echo kill singa @ $i ...
+    ssh $ssh_options $i $cmd
+  done
+fi
+
+# close zookeeper
+. $BIN/zk-service.sh stop 2>/dev/null
+
+echo cleanning metadata in zookeeper ...
+# remove zk data
+rm -r $ZKDATA_DIR
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/bin/zk-service.sh
----------------------------------------------------------------------
diff --git a/bin/zk-service.sh b/bin/zk-service.sh
index fc2c99c..1b67c7c 100755
--- a/bin/zk-service.sh
+++ b/bin/zk-service.sh
@@ -23,7 +23,7 @@
 # Manage a zookeeper service
 #
 
-usage="Usage: zk-service.sh (start|stop)"
+usage="Usage: zk-service.sh [start|stop]"
 
 if [ $# -le 0 ]; then
   echo $usage
@@ -35,16 +35,12 @@ BIN=`cd "$BIN">/dev/null; pwd`
 BASE=`cd "$BIN/..">/dev/null; pwd`
 ZKBASE=$BASE/thirdparty/zookeeper-3.4.6
 
-#echo $ZKBASE
-
-if [ "$SINGA_MANAGES_ZK" = "" ]; then
+if [ -z $SINGA_MANAGES_ZK ]; then
   SINGA_MANAGES_ZK=true
 fi
 
-#echo 'SINGA_MANAGES_ZK='$SINGA_MANAGES_ZK
-
-if [ "$SINGA_MANAGES_ZK" = "true" ]; then
-  #check zookeeper installation
+if [ $SINGA_MANAGES_ZK = true ]; then
+  # check zookeeper installation
   if [ ! -d $ZKBASE ]; then
     echo "zookeeper not found, please install zookeeper first:"
     echo "$./SINGA_BASE/thirdparty/install.sh zookeeper"
@@ -52,28 +48,28 @@ if [ "$SINGA_MANAGES_ZK" = "true" ]; then
   fi
 fi
 
-#get argument
+# get argument
 cmd=$1
 
 case $cmd in
 
 (start)
-  #start zk service
-  if [ "$SINGA_MANAGES_ZK" = "true" ]; then
-    #check zoo,cfg
+  # start zk service
+  if [ $SINGA_MANAGES_ZK = true ]; then
+    # check zoo,cfg
     if [ ! -f $ZKBASE/conf/zoo.cfg ]; then
       echo "zoo.cfg not found, create from sample.cfg"
       cp $ZKBASE/conf/zoo_sample.cfg $ZKBASE/conf/zoo.cfg
     fi
-    #echo 'starting zookeeper service...'
+    # echo 'starting zookeeper service...'
     $ZKBASE/bin/zkServer.sh start
   fi
   ;;
 
 (stop)
-  #stop zk service
-  if [ "$SINGA_MANAGES_ZK" = "true" ]; then
-    #echo 'stopping zookeeper service...'
+  # stop zk service
+  if [ $SINGA_MANAGES_ZK = true ]; then
+    # echo 'stopping zookeeper service...'
     $ZKBASE/bin/zkServer.sh stop
   fi
   ;;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index c12ff84..ce135cc 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -359,7 +359,7 @@ void Trainer::Run(const vector<shared_ptr<Worker>>& workers,
             break;
           }
         }else if(type==kMetric){
-          if(msg->src_first()>=0){
+          if(msg->src_first()==0){
             int step=msg->trgt_first();
             string prefix((char*)msg->frame_data(), msg->frame_size());
             msg->next_frame();

Reply via email to