Repository: incubator-singa
Updated Branches:
refs/heads/master f234c4972 -> 7d9c0fb4b
SINGA-26 Run distributed training in a single command
To run singa in distributed mode with a single command, use:
$ ./bin/singa-run.sh -conf=YOUR_CONF_DIR
The provided directory should contain the following three files:
* model.conf
* cluster.conf
* hostfile
Please note that $zookeeper_host in cluster.conf should be set to the
host that executes this script.
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/903e0362
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/903e0362
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/903e0362
Branch: refs/heads/master
Commit: 903e0362fcff1edfafa89f1dd78917d39a3af0d3
Parents: f234c49
Author: wang sheng <[email protected]>
Authored: Sun Jun 28 03:00:56 2015 +0800
Committer: wang sheng <[email protected]>
Committed: Sun Jun 28 03:00:56 2015 +0800
----------------------------------------------------------------------
bin/singa-cleanup.sh | 43 ------------------------
bin/singa-run.sh | 80 ++++++++++++++++++++++++++++++++++++---------
bin/singa-stop.sh | 67 +++++++++++++++++++++++++++++++++++++
bin/zk-service.sh | 28 +++++++---------
src/trainer/trainer.cc | 2 +-
5 files changed, 145 insertions(+), 75 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/bin/singa-cleanup.sh
----------------------------------------------------------------------
diff --git a/bin/singa-cleanup.sh b/bin/singa-cleanup.sh
deleted file mode 100755
index f94c9db..0000000
--- a/bin/singa-cleanup.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env bash
-#
-#/**
-# * Copyright 2015 The Apache Software Foundation
-# *
-# * Licensed to the Apache Software Foundation (ASF) under one
-# * or more contributor license agreements. See the NOTICE file
-# * distributed with this work for additional information
-# * regarding copyright ownership. The ASF licenses this file
-# * to you under the Apache License, Version 2.0 (the
-# * "License"); you may not use this file except in compliance
-# * with the License. You may obtain a copy of the License at
-# *
-# * http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-#
-# Manage a zookeeper service
-#
-
-usage="Usage: singa-cleanup.sh"
-
-#if [ $# -le 0 ]; then
-# echo $usage
-# exit 1
-#fi
-
-BIN=`dirname "${BASH_SOURCE-$0}"`
-BIN=`cd "$BIN">/dev/null; pwd`
-BASE=`cd "$BIN/..">/dev/null; pwd`
-ZKDATADIR="/tmp/zookeeper"
-
-. $BIN/zk-service.sh stop 2>/dev/null
-
-echo cleanning data in zookeeper...
-#remove zk data
-rm -r $ZKDATADIR
-
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/bin/singa-run.sh
----------------------------------------------------------------------
diff --git a/bin/singa-run.sh b/bin/singa-run.sh
index c911ec3..b69fe5d 100755
--- a/bin/singa-run.sh
+++ b/bin/singa-run.sh
@@ -23,34 +23,84 @@
# Run a Singa job
#
-usage="Usage: singa-run.sh"
+usage="Usage: \n \
+ (single node): singa-run.sh -cluster=YOUR_CONF_FILE -model=YOUR_CONF_FILE \n
\
+ (distributed): singa-run.sh -conf=YOUR_CONF_DIR \
+ (the directory should contain cluster.conf/model.conf/hostfile)"
-#if [ $# -le 0 ]; then
-# echo $usage
+#if [ $# -le 0 ] || [ $# -ge 3 ] ; then
+# echo -e $usage
# exit 1
#fi
+valid_args=false
+
+if [ $# = 1 ] ; then
+ if [[ $1 = "-conf="* ]] ; then
+ valid_args=true
+ conf_path=${1:6}
+ host_path=$conf_path/hostfile
+ fi
+elif [ $# = 2 ] ; then
+ if [[ $1 = "-cluster="* ]] && [[ $2 = "-model="* ]] ; then
+ valid_args=true
+ elif [[ $2 = "-cluster="* ]] && [[ $1 = "-model="* ]] ; then
+ valid_args=true
+ fi
+fi
+
+if [ $valid_args = false ] ; then
+ echo -e $usage
+ exit 1
+fi
+
+# get singa-base
BIN=`dirname "${BASH_SOURCE-$0}"`
BIN=`cd "$BIN">/dev/null; pwd`
BASE=`cd "$BIN/..">/dev/null; pwd`
cd $BASE
-#cleanup singa data
-. $BIN/singa-cleanup.sh
+# clenup singa data
+if [ -z $host_path ] ; then
+ . $BIN/singa-stop.sh
+else
+ . $BIN/singa-stop.sh $host_path
+fi
-#start zookeeper
+# start zookeeper
. $BIN/zk-service.sh start 2>/dev/null
-#wait for zk service to be up
+# wait for zk service to be up
sleep 3
-#run singa
-cmd="./singa "$@
-echo starting singa ...
-echo executing: $cmd
-exec $cmd
+# check mode
+if [ $# = 2 ] ; then
+ # start singa process
+ cmd="./singa "$@
+ echo starting singa ...
+ echo executing : $cmd
+ exec $cmd
+elif [ $# = 1 ] ; then
+ # ssh and start singa processes
+ ssh_options="-oStrictHostKeyChecking=no \
+ -oUserKnownHostsFile=/dev/null \
+ -oLogLevel=quiet"
+ hosts=(`cat $host_path |cut -d ' ' -f 1`)
+ for i in ${hosts[@]} ; do
+ cmd="cd $BASE; \
+ ./singa \
+ -cluster=$conf_path/cluster.conf \
+ -model=$conf_path/model.conf"
+ echo executing @ $i : $cmd
+ ssh $ssh_options $i $cmd &
+ done
+ wait
+fi
-#stop zookeeper
-echo stopping singa ...
-. $BIN/zk-service.sh stop 2>/dev/null
+# cleanup singa data
+if [ -z $host_path ] ; then
+ . $BIN/singa-stop.sh
+else
+ . $BIN/singa-stop.sh $host_path
+fi
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/bin/singa-stop.sh
----------------------------------------------------------------------
diff --git a/bin/singa-stop.sh b/bin/singa-stop.sh
new file mode 100755
index 0000000..acded75
--- /dev/null
+++ b/bin/singa-stop.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+#
+#/**
+# * Copyright 2015 The Apache Software Foundation
+# *
+# * Licensed to the Apache Software Foundation (ASF) under one
+# * or more contributor license agreements. See the NOTICE file
+# * distributed with this work for additional information
+# * regarding copyright ownership. The ASF licenses this file
+# * to you under the Apache License, Version 2.0 (the
+# * "License"); you may not use this file except in compliance
+# * with the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+#
+# Clean up singa processes and zookeeper metadata
+#
+
+usage="Usage: \n \
+ (local process): singa-stop.sh \n \
+ (distributed): singa-stop.sh HOST_FILE"
+
+if [ $# -gt 1 ]; then
+ echo -e $usage
+ exit 1
+fi
+
+BIN=`dirname "${BASH_SOURCE-$0}"`
+BIN=`cd "$BIN">/dev/null; pwd`
+BASE=`cd "$BIN/..">/dev/null; pwd`
+ZKDATA_DIR="/tmp/zookeeper"
+
+PROC_NAME="lt-singa"
+HOST_FILE=$1
+
+
+# kill singa processes
+if [ $# = 0 ] ; then
+ echo kill singa @ localhost ...
+ cmd="killall -s SIGKILL "$PROC_NAME
+ exec $cmd
+elif [ $# = 1 ] ; then
+ ssh_options="-oStrictHostKeyChecking=no \
+ -oUserKnownHostsFile=/dev/null \
+ -oLogLevel=quiet"
+ hosts=(`cat $HOST_FILE |cut -d ' ' -f 1`)
+ for i in ${hosts[@]} ; do
+ cmd="killall -s SIGKILL "$PROC_NAME
+ echo kill singa @ $i ...
+ ssh $ssh_options $i $cmd
+ done
+fi
+
+# close zookeeper
+. $BIN/zk-service.sh stop 2>/dev/null
+
+echo cleanning metadata in zookeeper ...
+# remove zk data
+rm -r $ZKDATA_DIR
+
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/bin/zk-service.sh
----------------------------------------------------------------------
diff --git a/bin/zk-service.sh b/bin/zk-service.sh
index fc2c99c..1b67c7c 100755
--- a/bin/zk-service.sh
+++ b/bin/zk-service.sh
@@ -23,7 +23,7 @@
# Manage a zookeeper service
#
-usage="Usage: zk-service.sh (start|stop)"
+usage="Usage: zk-service.sh [start|stop]"
if [ $# -le 0 ]; then
echo $usage
@@ -35,16 +35,12 @@ BIN=`cd "$BIN">/dev/null; pwd`
BASE=`cd "$BIN/..">/dev/null; pwd`
ZKBASE=$BASE/thirdparty/zookeeper-3.4.6
-#echo $ZKBASE
-
-if [ "$SINGA_MANAGES_ZK" = "" ]; then
+if [ -z $SINGA_MANAGES_ZK ]; then
SINGA_MANAGES_ZK=true
fi
-#echo 'SINGA_MANAGES_ZK='$SINGA_MANAGES_ZK
-
-if [ "$SINGA_MANAGES_ZK" = "true" ]; then
- #check zookeeper installation
+if [ $SINGA_MANAGES_ZK = true ]; then
+ # check zookeeper installation
if [ ! -d $ZKBASE ]; then
echo "zookeeper not found, please install zookeeper first:"
echo "$./SINGA_BASE/thirdparty/install.sh zookeeper"
@@ -52,28 +48,28 @@ if [ "$SINGA_MANAGES_ZK" = "true" ]; then
fi
fi
-#get argument
+# get argument
cmd=$1
case $cmd in
(start)
- #start zk service
- if [ "$SINGA_MANAGES_ZK" = "true" ]; then
- #check zoo,cfg
+ # start zk service
+ if [ $SINGA_MANAGES_ZK = true ]; then
+ # check zoo,cfg
if [ ! -f $ZKBASE/conf/zoo.cfg ]; then
echo "zoo.cfg not found, create from sample.cfg"
cp $ZKBASE/conf/zoo_sample.cfg $ZKBASE/conf/zoo.cfg
fi
- #echo 'starting zookeeper service...'
+ # echo 'starting zookeeper service...'
$ZKBASE/bin/zkServer.sh start
fi
;;
(stop)
- #stop zk service
- if [ "$SINGA_MANAGES_ZK" = "true" ]; then
- #echo 'stopping zookeeper service...'
+ # stop zk service
+ if [ $SINGA_MANAGES_ZK = true ]; then
+ # echo 'stopping zookeeper service...'
$ZKBASE/bin/zkServer.sh stop
fi
;;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/903e0362/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index c12ff84..ce135cc 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -359,7 +359,7 @@ void Trainer::Run(const vector<shared_ptr<Worker>>& workers,
break;
}
}else if(type==kMetric){
- if(msg->src_first()>=0){
+ if(msg->src_first()==0){
int step=msg->trgt_first();
string prefix((char*)msg->frame_data(), msg->frame_size());
msg->next_frame();