Repository: incubator-singa Updated Branches: refs/heads/master 5bf1c9280 -> 29de86337
SINGA-38 Support concurrent jobs clean bash scripts in /bin * scripts change input relative path to absolute path * scripts catch errors when running external scripts/binaries * add singa-env.sh to generate environment variables Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/f746b995 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/f746b995 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/f746b995 Branch: refs/heads/master Commit: f746b995ecffe4abe2f0667dbf0c10624ce00780 Parents: 5bf1c92 Author: wang sheng <[email protected]> Authored: Sun Jul 19 05:31:32 2015 +0800 Committer: wang sheng <[email protected]> Committed: Tue Jul 21 22:16:35 2015 +0800 ---------------------------------------------------------------------- bin/singa-cleanup.sh | 35 +++++++++++++ bin/singa-env.sh | 57 +++++++++++++++++++++ bin/singa-run.sh | 119 ++++++++++++++++--------------------------- bin/singa-stop.sh | 70 +++++++++++-------------- bin/zk-service.sh | 72 +++++++++++++------------- src/trainer/worker.cc | 2 +- src/utils/cluster_rt.cc | 2 +- tool/gen_hosts.py | 23 +++++---- 8 files changed, 214 insertions(+), 166 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f746b995/bin/singa-cleanup.sh ---------------------------------------------------------------------- diff --git a/bin/singa-cleanup.sh b/bin/singa-cleanup.sh new file mode 100755 index 0000000..9b542c6 --- /dev/null +++ b/bin/singa-cleanup.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# +#/** +# * Copyright 2015 The Apache Software Foundation +# * +# * Licensed to the Apache Software Foundation (ASF) under one +# * or more contributor license agreements. See the NOTICE file +# * distributed with this work for additional information +# * regarding copyright ownership. 
The ASF licenses this file +# * to you under the Apache License, Version 2.0 (the +# * "License"); you may not use this file except in compliance +# * with the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# +# clean up singa processes and zookeeper metadata +# + +# get environment variables +. `dirname "${BASH_SOURCE-$0}"`/singa-env.sh + +# clean singa jobs and data +$SINGA_BIN/singa-stop.sh || exit 1 + +# close zookeeper +if [ $SINGA_MANAGES_ZK = true ]; then + $SINGA_BIN/zk-service.sh stop +fi http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f746b995/bin/singa-env.sh ---------------------------------------------------------------------- diff --git a/bin/singa-env.sh b/bin/singa-env.sh new file mode 100755 index 0000000..10578b8 --- /dev/null +++ b/bin/singa-env.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# +#/** +# * Copyright 2015 The Apache Software Foundation +# * +# * Licensed to the Apache Software Foundation (ASF) under one +# * or more contributor license agreements. See the NOTICE file +# * distributed with this work for additional information +# * regarding copyright ownership. The ASF licenses this file +# * to you under the Apache License, Version 2.0 (the +# * "License"); you may not use this file except in compliance +# * with the License. 
You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# +# set Singa environment variables, includes: +# * SINGA_HOME +# * SINGA_BIN +# * SINGA_CONF +# * ZK_HOME +# * SINGA_MANAGES_ZK +# + +# set SINGA_BIN +if [ -z $SINGA_BIN ]; then + SINGA_BIN=`dirname "${BASH_SOURCE-$0}"` + SINGA_BIN=`cd "$SINGA_BIN">/dev/null; pwd` +fi + +# set SINGA_HOME +if [ -z $SINGA_HOME ]; then + SINGA_HOME=`cd "$SINGA_BIN/..">/dev/null; pwd` +fi + +# set SINGA_CONF +if [ -z $SINGA_CONF ]; then + SINGA_CONF=$SINGA_HOME/conf +fi + +# set ZK_HOME +if [ -z $ZK_HOME ]; then + ZK_HOME=$SINGA_HOME/thirdparty/zookeeper-3.4.6 + SINGA_MANAGES_ZK=true +fi + +# set SINGA_MANAGES_ZK +if [ -z $SINGA_MANAGES_ZK ]; then + SINGA_MANAGES_ZK=false +fi + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f746b995/bin/singa-run.sh ---------------------------------------------------------------------- diff --git a/bin/singa-run.sh b/bin/singa-run.sh index 0a8c9f6..2c282c3 100755 --- a/bin/singa-run.sh +++ b/bin/singa-run.sh @@ -20,88 +20,59 @@ # * limitations under the License. 
# */ # -# Run a Singa job +# run a Singa job # -usage="Usage: \n \ - (single node): singa-run.sh -cluster=YOUR_CONF_FILE -model=YOUR_CONF_FILE \n \ - (distributed): singa-run.sh -conf=YOUR_CONF_DIR \ - (the directory should contain cluster.conf/model.conf/hostfile)" +usage="Usage: singa-run.sh -conf=CONF_DIR + (CONF_DIR should contain cluster.conf && model.conf)" +# usage="Usage: \n +# (single process): singa-run.sh -cluster=YOUR_CONF_FILE -model=YOUR_CONF_FILE \n +# (multi-process): singa-run.sh -conf=YOUR_CONF_DIR +# (the directory should contain cluster.conf && model.conf)" -#if [ $# -le 0 ] || [ $# -ge 3 ] ; then -# echo -e $usage -# exit 1 -#fi - -valid_args=false - -if [ $# = 1 ] ; then - if [[ $1 = "-conf="* ]] ; then - valid_args=true - conf_path=${1:6} - host_path=$conf_path/job.hosts - fi -elif [ $# = 2 ] ; then - if [[ $1 = "-cluster="* ]] && [[ $2 = "-model="* ]] ; then - valid_args=true - elif [[ $2 = "-cluster="* ]] && [[ $1 = "-model="* ]] ; then - valid_args=true - fi -fi - -if [ $valid_args = false ] ; then - echo -e $usage +# check arguments +if [ $# != 1 ] || [[ $1 != "-conf="* ]]; then + echo $usage exit 1 fi -# get singa-base -BIN=`dirname "${BASH_SOURCE-$0}"` -BIN=`cd "$BIN">/dev/null; pwd` -BASE=`cd "$BIN/..">/dev/null; pwd` - -cd $BASE +# get environment variables +. 
`dirname "${BASH_SOURCE-$0}"`/singa-env.sh +# get workspace path +workspace=`cd "${1:6}">/dev/null; pwd` # start zookeeper -$BIN/zk-service.sh start 2>/dev/null +if [ $SINGA_MANAGES_ZK = true ]; then + $SINGA_BIN/zk-service.sh start || exit 1 +fi -# wait for zk service to be up -sleep 3 +# cleanup old processes and data +$SINGA_BIN/singa-stop.sh || exit 1 -# clenup singa data -$BIN/singa-stop.sh conf/hostfile +# generate host file +host_file=$workspace/job.hosts +python $SINGA_HOME/tool/gen_hosts.py -conf=$workspace/cluster.conf \ + -hosts=$SINGA_CONF/hostfile \ + -output=$host_file \ + || exit 1 -# check mode -if [ $# = 2 ] ; then - # start single singa process - cmd="./singa "$@ - echo starting singa ... - echo executing : $cmd - $cmd -elif [ $# = 1 ] ; then - # start multiple singa processes - # generate host file - cmd=" python tool/gen_hosts.py -conf=$conf_path/cluster.conf \ - -src=conf/hostfile -dst=$host_path" - echo $cmd - $cmd - # ssh and start singa processes - ssh_options="-oStrictHostKeyChecking=no \ - -oUserKnownHostsFile=/dev/null \ - -oLogLevel=quiet" - hosts=(`cat $host_path |cut -d ' ' -f 1`) - cmd="./singa -cluster=$conf_path/cluster.conf -model=$conf_path/model.conf" - ssh_cmd="cd $BASE; "$cmd - for i in ${hosts[@]} ; do - if [ $i = localhost ] ; then - echo executing : $cmd - $cmd & - else - echo executing @ $i : $ssh_cmd - ssh $ssh_options $i $ssh_cmd & - fi - done - wait -fi - -# cleanup singa data -#$BIN/singa-stop.sh conf/hostfile +# ssh and start singa processes +ssh_options="-oStrictHostKeyChecking=no \ +-oUserKnownHostsFile=/dev/null \ +-oLogLevel=quiet" +hosts=`cat $host_file |cut -d ' ' -f 1` +# cd to SINGA_HOME as it need conf/singa.conf +cd $SINGA_HOME +singa_run="./singa -cluster=$workspace/cluster.conf -model=$workspace/model.conf" +singa_sshrun="cd $SINGA_HOME; ./singa -cluster=$workspace/cluster.conf \ + -model=$workspace/model.conf" +for i in ${hosts[@]} ; do + if [ $i = localhost ] ; then + echo executing : $singa_run + 
$singa_run & + else + echo executing @ $i : $singa_sshrun + ssh $ssh_options $i $singa_sshrun & + fi +done +wait http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f746b995/bin/singa-stop.sh ---------------------------------------------------------------------- diff --git a/bin/singa-stop.sh b/bin/singa-stop.sh index 0e7170b..e3f24dc 100755 --- a/bin/singa-stop.sh +++ b/bin/singa-stop.sh @@ -20,53 +20,41 @@ # * limitations under the License. # */ # -# Clean up singa processes and zookeeper metadata +# clean up singa processes and zookeeper metadata # -usage="Usage: \n \ - (local process): singa-stop.sh \n \ - (distributed): singa-stop.sh HOST_FILE" - -if [ $# -gt 1 ]; then - echo -e $usage - exit 1 -fi - -BIN=`dirname "${BASH_SOURCE-$0}"` -BIN=`cd "$BIN">/dev/null; pwd` -BASE=`cd "$BIN/..">/dev/null; pwd` -ZKDATA_DIR="/tmp/zookeeper" - -PROC_NAME="singa" -HOST_FILE=$1 +# usage="Usage: \n \ +# (local process): singa-stop.sh \n \ +# (distributed): singa-stop.sh HOST_FILE" +# +# if [ $# -gt 1 ]; then +# echo -e $usage +# exit 1 +# fi -cd $BASE +# get environment variables +. `dirname "${BASH_SOURCE-$0}"`/singa-env.sh # kill singa processes -if [ $# = 0 ] ; then - echo kill singa @ localhost ... - cmd="killall -s SIGKILL "$PROC_NAME - $cmd -elif [ $# = 1 ] ; then - ssh_options="-oStrictHostKeyChecking=no \ - -oUserKnownHostsFile=/dev/null \ - -oLogLevel=quiet" - hosts=(`cat $HOST_FILE |cut -d ' ' -f 1`) - for i in ${hosts[@]} ; do - cmd="killall -s SIGKILL -r "$PROC_NAME - echo kill singa @ $i ... - if [ $i == localhost ] ; then - $cmd - else - ssh $ssh_options $i $cmd - fi - done -fi - +host_file=$SINGA_CONF/hostfile +ssh_options="-oStrictHostKeyChecking=no \ + -oUserKnownHostsFile=/dev/null \ + -oLogLevel=quiet" +hosts=`cat $host_file |cut -d ' ' -f 1` +singa_kill="killall -s SIGKILL -r singa" +for i in ${hosts[@]}; do + echo kill singa @ $i ... 
+ if [ $i == localhost ]; then + $singa_kill + else + ssh $ssh_options $i $singa_kill + fi +done # wait for killall command sleep 2 -echo cleanning metadata in zookeeper ... # remove zk data -./singatool - +# singatool need global conf under SINGA_HOME +echo cleanning metadata in zookeeper ... +cd $SINGA_HOME +./singatool || exit 1 http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f746b995/bin/zk-service.sh ---------------------------------------------------------------------- diff --git a/bin/zk-service.sh b/bin/zk-service.sh index 1b67c7c..f9d3823 100755 --- a/bin/zk-service.sh +++ b/bin/zk-service.sh @@ -20,59 +20,55 @@ # * limitations under the License. # */ # -# Manage a zookeeper service +# manage ZooKeeper service # usage="Usage: zk-service.sh [start|stop]" -if [ $# -le 0 ]; then +if [ $# != 1 ]; then echo $usage exit 1 fi -BIN=`dirname "${BASH_SOURCE-$0}"` -BIN=`cd "$BIN">/dev/null; pwd` -BASE=`cd "$BIN/..">/dev/null; pwd` -ZKBASE=$BASE/thirdparty/zookeeper-3.4.6 +# get environment variables +. `dirname "${BASH_SOURCE-$0}"`/singa-env.sh -if [ -z $SINGA_MANAGES_ZK ]; then - SINGA_MANAGES_ZK=true +# check if singa manages zookeeper service +if [ $SINGA_MANAGES_ZK != true ]; then + echo "Singa does not manage a valid zookeeper service (SINGA_MANAGES_ZK != true)" + exit 1 fi -if [ $SINGA_MANAGES_ZK = true ]; then - # check zookeeper installation - if [ ! -d $ZKBASE ]; then - echo "zookeeper not found, please install zookeeper first:" - echo "$./SINGA_BASE/thirdparty/install.sh zookeeper" - exit 1 - fi +# check zookeeper installation +if [ ! -d $ZK_HOME ]; then + echo "zookeeper not found at $ZK_HOME" + echo "if you do not have zookeeper service, please install:" + echo " $SINGA_HOME/thirdparty/install.sh zookeeper" + echo "otherwise, please set ZK_HOME correctly" + exit 1 fi -# get argument -cmd=$1 - -case $cmd in - -(start) - # start zk service - if [ $SINGA_MANAGES_ZK = true ]; then - # check zoo,cfg - if [ ! 
-f $ZKBASE/conf/zoo.cfg ]; then +# get command +case $1 in + start) + # start zk service + # check zoo.cfg + if [ ! -f $ZK_HOME/conf/zoo.cfg ]; then echo "zoo.cfg not found, create from sample.cfg" - cp $ZKBASE/conf/zoo_sample.cfg $ZKBASE/conf/zoo.cfg + cp $ZK_HOME/conf/zoo_sample.cfg $ZK_HOME/conf/zoo.cfg fi - # echo 'starting zookeeper service...' - $ZKBASE/bin/zkServer.sh start - fi - ;; - -(stop) - # stop zk service - if [ $SINGA_MANAGES_ZK = true ]; then - # echo 'stopping zookeeper service...' - $ZKBASE/bin/zkServer.sh stop - fi - ;; + # cd to SINGA_HOME as zookeeper.out will be here + cd $SINGA_HOME + $ZK_HOME/bin/zkServer.sh start 2>/dev/null + ;; + stop) + # stop zk service + $ZK_HOME/bin/zkServer.sh stop 2>/dev/null + ;; + + *) + echo $usage + exit 1 esac http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f746b995/src/trainer/worker.cc ---------------------------------------------------------------------- diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc index b6ce0d5..7d779ad 100644 --- a/src/trainer/worker.cc +++ b/src/trainer/worker.cc @@ -183,7 +183,7 @@ void Worker::Run() { msg->set_type(kStop); dealer_->Send(&msg); // use param dealer to send the stop msg - LOG(ERROR) << "Worker (group = " <<grp_id_ << ", id = " << id_ << ") stop"; + LOG(ERROR) << "Worker (group = " <<grp_id_ << ", id = " << id_ << ") stops"; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f746b995/src/utils/cluster_rt.cc ---------------------------------------------------------------------- diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc index 408adde..7e5a86b 100644 --- a/src/utils/cluster_rt.cc +++ b/src/utils/cluster_rt.cc @@ -106,7 +106,7 @@ bool ZKService::Exist(const char* path) { int ret = zoo_exists(zkhandle_, path, 0, &stat); if (ret == ZOK) return true; else if (ret == ZNONODE) return false; - LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_exists)"; + //LOG(ERROR) << "Unhandled ZK error code: " << ret << " 
(zoo_exists)"; return false; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f746b995/tool/gen_hosts.py ---------------------------------------------------------------------- diff --git a/tool/gen_hosts.py b/tool/gen_hosts.py index 570eff9..a3bec47 100644 --- a/tool/gen_hosts.py +++ b/tool/gen_hosts.py @@ -9,8 +9,8 @@ from pb2.cluster_pb2 import ClusterProto # parse command line parser = argparse.ArgumentParser(description='Generate host list from host file for a SINGA job') parser.add_argument('-conf', dest='conf', metavar='CONF_FILE', required=True, help='cluster.conf file') -parser.add_argument('-src', dest='src', metavar='SRC_FILE', required=True, help='global host file') -parser.add_argument('-dst', dest='dst', metavar='DST_FILE', required=True, help='generated list') +parser.add_argument('-hosts', dest='hosts', metavar='HOST_FILE', required=True, help='global host file') +parser.add_argument('-output', dest='output', metavar='OUTPUT_FILE', required=True, help='generated list') args = parser.parse_args(); # read from .conf file @@ -27,21 +27,22 @@ else: fd_conf.close() # read from source host file -fd_src = open(args.src, 'r') +fd_hosts = open(args.hosts, 'r') hosts = [] -for line in fd_src: +for line in fd_hosts: line = line.strip() if len(line) == 0 or line[0] == '#': continue hosts.append(line) -fd_src.close() +fd_hosts.close() -# write to dst file +# write to output file num_hosts = len(hosts) if (num_hosts == 0): - print 'ERROR: source host file is empty' - sys.exit() -fd_dst = open(args.dst, 'w') + print "contains no valid host %s" % args.hosts + sys.exit(1) +fd_output = open(args.output, 'w') for i in range(nprocs): - fd_dst.write(hosts[i % num_hosts] + '\n') -fd_dst.close() + fd_output.write(hosts[i % num_hosts] + '\n') +fd_output.close() +print 'generate host list at %s' % args.output
