Repository: incubator-singa
Updated Branches:
refs/heads/master 7a61a687c -> 2c7edd73c
SINGA-43 Remove Job-related output from workspace
the singa-run script now only takes a job.conf as input (instead of a workspace)
users are required to set a workspace in their job.conf
all job information is recorded in the log_dir, which is set in singa.conf
the dir structure is as follows:
log_dir/job-info/job-ID-YYYYmmdd-HHMMSS/job.hosts : host list
log_dir/job-info/job-ID-YYYYmmdd-HHMMSS/job.pids : pid list
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/2c7edd73
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/2c7edd73
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/2c7edd73
Branch: refs/heads/master
Commit: 2c7edd73c0636f2085247ea29300fc1913ee05d5
Parents: 7a61a68
Author: wang sheng <[email protected]>
Authored: Thu Aug 13 11:25:12 2015 +0800
Committer: wang sheng <[email protected]>
Committed: Thu Aug 13 12:43:38 2015 +0800
----------------------------------------------------------------------
bin/singa-cleanup.sh | 2 +-
bin/singa-console.sh | 14 ++--
bin/singa-env.sh | 13 ++++
bin/singa-run.sh | 46 +++++++-----
bin/singa-stop.sh | 2 +-
examples/cifar10/job.conf | 1 +
include/utils/cluster_rt.h | 4 +-
src/main.cc | 6 +-
src/proto/job.proto | 2 +-
src/proto/singa.proto | 4 +-
src/utils/cluster_rt.cc | 7 +-
src/utils/tool.cc | 159 ++++++++++++++++++++++++----------------
12 files changed, 156 insertions(+), 104 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-cleanup.sh
----------------------------------------------------------------------
diff --git a/bin/singa-cleanup.sh b/bin/singa-cleanup.sh
index 9b542c6..c987ca4 100755
--- a/bin/singa-cleanup.sh
+++ b/bin/singa-cleanup.sh
@@ -31,5 +31,5 @@ $SINGA_BIN/singa-stop.sh || exit 1
# close zookeeper
if [ $SINGA_MANAGES_ZK = true ]; then
- $SINGA_BIN/zk-service.sh stop
+ $SINGA_BIN/zk-service.sh stop || exit 1
fi
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-console.sh
----------------------------------------------------------------------
diff --git a/bin/singa-console.sh b/bin/singa-console.sh
index 8f7cac1..36913ce 100755
--- a/bin/singa-console.sh
+++ b/bin/singa-console.sh
@@ -23,10 +23,10 @@
# console to list/view/kill singa jobs
#
-usage="Usage:\n
- # singa-console.sh list : list running singa jobs\n
- # singa-console.sh view JOB_ID : view procs of a singa job\n
- # singa-console.sh kill JOB_ID : kill a singa job"
+usage="Usage: singa-console.sh <command> <args>\n
+ list : list running singa jobs\n
+ view JOB_ID : view procs of a singa job\n
+ kill JOB_ID : kill a singa job"
if [ $# == 0 ]; then
echo -e $usage
@@ -59,12 +59,11 @@ case $1 in
echo -e $usage
exit 1
fi
- host_file="job-$2.tmp"
- ./singatool view $2 1>$host_file || exit 1
+ hosts=`./singatool view "$2"`
+ [ $? == 0 ] || exit 1
ssh_options="-oStrictHostKeyChecking=no \
-oUserKnownHostsFile=/dev/null \
-oLogLevel=quiet"
- hosts=`cat $host_file | cut -d ' ' -f 1`
if [ `head -1 "$SINGA_CONF"/hostfile` == localhost ]; then
local_procs=1
fi
@@ -79,7 +78,6 @@ case $1 in
$singa_kill
fi
done
- rm $host_file
./singatool clean $2 || exit 1
;;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-env.sh
----------------------------------------------------------------------
diff --git a/bin/singa-env.sh b/bin/singa-env.sh
index 10578b8..c9d42bd 100755
--- a/bin/singa-env.sh
+++ b/bin/singa-env.sh
@@ -24,10 +24,14 @@
# * SINGA_HOME
# * SINGA_BIN
# * SINGA_CONF
+# * SINGA_LOG
# * ZK_HOME
# * SINGA_MANAGES_ZK
#
+# exit if variables already set
+[ -z $SINGA_ENV_DONE ] || exit 0
+
# set SINGA_BIN
if [ -z $SINGA_BIN ]; then
SINGA_BIN=`dirname "${BASH_SOURCE-$0}"`
@@ -44,6 +48,13 @@ if [ -z $SINGA_CONF ]; then
SINGA_CONF=$SINGA_HOME/conf
fi
+# set SINGA_LOG
+if [ -z $SINGA_LOG ]; then
+ # add -global arg, so no need to run under SINGA_HOME
+ SINGA_LOG=`"$SINGA_HOME"/singatool getlogdir -global="$SINGA_CONF"/singa.conf`
+ [ $? == 0 ] || exit 1
+fi
+
# set ZK_HOME
if [ -z $ZK_HOME ]; then
ZK_HOME=$SINGA_HOME/thirdparty/zookeeper-3.4.6
@@ -55,3 +66,5 @@ if [ -z $SINGA_MANAGES_ZK ]; then
SINGA_MANAGES_ZK=false
fi
+# mark that we have done all
+SINGA_ENV_DONE=1
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-run.sh
----------------------------------------------------------------------
diff --git a/bin/singa-run.sh b/bin/singa-run.sh
index d434331..aa65fd9 100755
--- a/bin/singa-run.sh
+++ b/bin/singa-run.sh
@@ -23,8 +23,7 @@
# run a Singa job
#
-usage="Usage: singa-run.sh -workspace=YOUR_WORKSPACE [ --resume ]\n
- # workspace should contain job.conf\n
+usage="Usage: singa-run.sh -conf=JOB_CONF [ --resume ]\n
# set --resume if want to recover a job\n
### NOTICE ###\n
# if you are using model.conf + cluster.conf,\n
@@ -33,28 +32,30 @@ usage="Usage: singa-run.sh -workspace=YOUR_WORKSPACE [ --resume ]\n
# check arguments
while [ $# != 0 ]; do
- if [[ $1 == "-workspace="* ]]; then
- workspace=$1
+ if [[ $1 == "-conf="* ]]; then
+ conf=$1
elif [ $1 == "--resume" ]; then
resume=1
else
- echo -e $usage
- exit 1
+ echo -e $usage && exit 1
fi
shift
done
-if [ -z $workspace ]; then
+if [ -z $conf ]; then
echo -e $usage
exit 1
fi
# get environment variables
. `dirname "${BASH_SOURCE-$0}"`/singa-env.sh
-# get workspace path
-workspace=`cd "${workspace:11}">/dev/null; pwd`
-job_conf=$workspace/job.conf
+
+# change conf to an absolute path
+conf_dir=`dirname "${conf:6}"`
+conf_dir=`cd "$conf_dir">/dev/null; pwd`
+conf_base=`basename "${conf:6}"`
+job_conf=$conf_dir/$conf_base
if [ ! -f $job_conf ]; then
- echo job.conf not exists in $workspace
+ echo $job_conf not exists
exit 1
fi
cd $SINGA_HOME
@@ -64,20 +65,26 @@ if [ $SINGA_MANAGES_ZK = true ]; then
$SINGA_BIN/zk-service.sh start || exit 1
fi
+# generate unique job id
+job_id=`./singatool create`
+[ $? == 0 ] || exit 1
+echo Unique JOB_ID is $job_id
+
+# generate job info dir
+# format: job-JOB_ID-YYYYMMDD-HHMMSS
+log_dir=$SINGA_LOG/job-info/job-$job_id-$(date '+%Y%m%d-%H%M%S');
+mkdir -p $log_dir
+echo Record job information to $log_dir
+
# generate host file
-host_file=$workspace/job.hosts
+host_file=$log_dir/job.hosts
python $SINGA_HOME/tool/gen_hosts.py -conf=$job_conf \
-hosts=$SINGA_CONF/hostfile \
-output=$host_file \
|| exit 1
-# generate unique job id
-./singatool create 1>$workspace/job.id || exit 1
-job_id=`cat $workspace/job.id`
-echo Generate job id to $workspace/job.id [job_id = $job_id]
-
# set command to run singa
-singa_run="./singa -workspace=$workspace -job=$job_id"
+singa_run="./singa -conf=$job_conf -job=$job_id"
if [ ! -z $resume ]; then
singa_run="$singa_run --resume"
fi
@@ -100,6 +107,5 @@ done
# generate pid list for this job
sleep 2
-./singatool view $job_id 1>$workspace/job.pids || exit
-echo Generate pid list to $workspace/job.pids
+./singatool view $job_id 1>$log_dir/job.pids || exit 1
wait
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/bin/singa-stop.sh
----------------------------------------------------------------------
diff --git a/bin/singa-stop.sh b/bin/singa-stop.sh
index ff67f32..115f3fb 100755
--- a/bin/singa-stop.sh
+++ b/bin/singa-stop.sh
@@ -33,7 +33,7 @@ ssh_options="-oStrictHostKeyChecking=no \
-oUserKnownHostsFile=/dev/null \
-oLogLevel=quiet"
hosts=`cat $host_file | cut -d ' ' -f 1`
-singa_kill="killall -s SIGKILL -r singa"
+singa_kill="killall -q -s SIGKILL -r singa"
for i in ${hosts[@]}; do
echo Kill singa @ $i ...
if [ $i == localhost ]; then
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index 2541330..f7829b8 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -1,6 +1,7 @@
cluster {
nworker_groups: 1
nserver_groups: 1
+ workspace: "examples/cifar10"
}
model {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/include/utils/cluster_rt.h
----------------------------------------------------------------------
diff --git a/include/utils/cluster_rt.h b/include/utils/cluster_rt.h
index 5738ae3..90f60cd 100644
--- a/include/utils/cluster_rt.h
+++ b/include/utils/cluster_rt.h
@@ -93,7 +93,7 @@ class ZKService {
RTCallback *cb);
private:
- const int kNumRetry = 10;
+ const int kNumRetry = 5;
const int kSleepSec = 1;
static void WatcherGlobal(zhandle_t* zh, int type, int state,
@@ -139,7 +139,7 @@ class JobManager {
JobManager(const std::string& host, int timeout);
bool Init();
- int GenerateJobID();
+ bool GenerateJobID(int* id);
bool ListJobs(std::vector<JobInfo>* jobs);
bool ListJobProcs(int job, std::vector<std::string>* procs);
bool Clean(int job);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/main.cc
----------------------------------------------------------------------
diff --git a/src/main.cc b/src/main.cc
index 87ab384..d95e405 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -13,7 +13,7 @@
DEFINE_int32(job, -1, "Unique job ID generated from singa-run.sh");
DEFINE_bool(resume, false, "Resume from checkpoint passed at cmd line");
-DEFINE_string(workspace, "./workspace", "workspace passed at cmd line");
+DEFINE_string(conf, "./job.conf", "job conf passed at cmd line");
/**
* Register layers, and other customizable classes.
@@ -31,12 +31,10 @@ int main(int argc, char **argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
singa::JobProto jobConf;
- std::string job_file = FLAGS_workspace + "/job.conf";
+ std::string job_file = FLAGS_conf;
singa::ReadProtoFromTextFile(job_file.c_str(), &jobConf);
CHECK(jobConf.has_cluster());
CHECK(jobConf.has_model());
- if (!jobConf.cluster().has_workspace())
- jobConf.mutable_cluster()->set_workspace(FLAGS_workspace);
RegisterClasses();
singa::SubmitJob(FLAGS_job, FLAGS_resume, jobConf);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index 3b22470..eacf7e0 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -19,7 +19,7 @@ message ClusterProto {
// port number is used by ZeroMQ
optional int32 start_port = 13 [default = 6723];
// local workspace, train/val/test shards, checkpoint files
- optional string workspace = 14 [default = "workspace"];
+ required string workspace = 14;
// conduct updates at server side; otherwise do it at worker side
optional bool server_update = 40 [default = true];
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/proto/singa.proto
----------------------------------------------------------------------
diff --git a/src/proto/singa.proto b/src/proto/singa.proto
index 94af58d..193c8b7 100644
--- a/src/proto/singa.proto
+++ b/src/proto/singa.proto
@@ -3,6 +3,6 @@ package singa;
message SingaProto {
// ip/hostname:port[,ip/hostname:port]
required string zookeeper_host = 1;
- // if not set, use the default dir of glog
- optional string log_dir = 2;
+ // log dir for singa binary and job information(job id, host list, pid list)
+ optional string log_dir = 2 [default = "/tmp/singa-log/"];
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/utils/cluster_rt.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc
index 0458b12..cd11bbd 100644
--- a/src/utils/cluster_rt.cc
+++ b/src/utils/cluster_rt.cc
@@ -297,14 +297,15 @@ bool JobManager::Init() {
return true;
}
-int JobManager::GenerateJobID() {
+bool JobManager::GenerateJobID(int* id) {
char buf[kZKBufSize];
string lock = kZKPathJLock + "/lock-";
if (!zk_.CreateNode(lock.c_str(), nullptr,
ZOO_EPHEMERAL | ZOO_SEQUENCE, buf)) {
- return -1;
+ return false;
}
- return atoi(buf+strlen(buf)-10);
+ *id = atoi(buf+strlen(buf)-10);
+ return true;
}
bool JobManager::ListJobProcs(int job, vector<string>* procs) {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2c7edd73/src/utils/tool.cc
----------------------------------------------------------------------
diff --git a/src/utils/tool.cc b/src/utils/tool.cc
index 60da414..c450b72 100644
--- a/src/utils/tool.cc
+++ b/src/utils/tool.cc
@@ -11,74 +11,109 @@ namespace gflags = google;
DEFINE_string(global, "conf/singa.conf", "Global config file");
+singa::SingaProto global;
+const int SUCCESS = 0;
+const int ARG_ERR = 1;
+const int RUN_ERR = 2;
+
+// generate a unique job id
+int create() {
+ singa::JobManager mngr(global.zookeeper_host());
+ if (!mngr.Init()) return RUN_ERR;
+ int id;
+ if (!mngr.GenerateJobID(&id)) return RUN_ERR;
+ printf("%d\n", id);
+ return SUCCESS;
+}
+
+// list singa jobs (running or all)
+int list(bool all) {
+ singa::JobManager mngr(global.zookeeper_host());
+ if (!mngr.Init()) return RUN_ERR;
+ std::vector<singa::JobInfo> jobs;
+ if (!mngr.ListJobs(&jobs)) return RUN_ERR;
+ printf("JOB ID |NUM PROCS \n");
+ printf("----------|-----------\n");
+ for (singa::JobInfo job : jobs) {
+ if (!job.procs && !all) continue;
+ printf("%-10d|%-10d\n", job.id, job.procs);
+ }
+ return SUCCESS;
+}
+
+// view procs of a singa job
+int view(int id) {
+ singa::JobManager mngr(global.zookeeper_host());
+ if (!mngr.Init()) return RUN_ERR;
+ std::vector<std::string> procs;
+ if (!mngr.ListJobProcs(id, &procs)) return RUN_ERR;
+ for (std::string s : procs) {
+ printf("%s\n", s.c_str());
+ }
+ return SUCCESS;
+}
+
+// clean a job path in zookeeper
+int clean(int id) {
+ singa::JobManager mngr(global.zookeeper_host());
+ if (!mngr.Init()) return RUN_ERR;
+ if (!mngr.Clean(id)) return RUN_ERR;
+ return SUCCESS;
+}
+
+// clean all singa data in zookeeper
+int cleanup() {
+ singa::JobManager mngr(global.zookeeper_host());
+ if (!mngr.Init()) return RUN_ERR;
+ if (!mngr.Cleanup()) return RUN_ERR;
+ return SUCCESS;
+}
+
+// show log dir in global config
+int getlogdir() {
+ std::string dir = global.log_dir();
+ while (dir.length() > 1 && dir[dir.length()-1] == '/') dir.pop_back();
+ printf("%s\n", dir.c_str());
+ return SUCCESS;
+}
+
int main(int argc, char **argv) {
- google::InitGoogleLogging(argv[0]);
+ std::string usage = "usage: singatool <command> <args>\n"
+ " getlogdir : show log dir in global config\n"
+ " create : generate a unique job id\n"
+ " list : list running singa jobs\n"
+ " listall : list all singa jobs\n"
+ " view JOB_ID : view procs of a singa job\n"
+ " clean JOB_ID : clean a job path in zookeeper\n"
+ " cleanup : clean all singa data in zookeeper\n";
// set logging level to ERROR and log to STDERR
FLAGS_logtostderr = 1;
FLAGS_minloglevel = 2;
+ google::InitGoogleLogging(argv[0]);
gflags::ParseCommandLineFlags(&argc, &argv, true);
-
- singa::SingaProto global;
singa::ReadProtoFromTextFile(FLAGS_global.c_str(), &global);
- LOG(INFO) << "The global config is \n" << global.DebugString();
- singa::JobManager mng(global.zookeeper_host());
- std::string usage = "singatool usage:\n"
- "# ./singatool create : generate a unique job id\n"
- "# ./singatool list : list running singa jobs\n"
- "# ./singatool view JOB_ID : view procs of a singa job\n"
- "# ./singatool clean JOB_ID : clean a job path in zookeeper\n"
- "# ./singatool cleanup : clean all singa data in zookeeper\n"
- "# ./singatool listall : list all singa jobs\n";
- if (argc <= 1) {
- LOG(ERROR) << usage;
- return 1;
+ // stat code: ARG_ERR for wrong argument, RUN_ERR for runtime error
+ int stat = SUCCESS;
+ if (argc <= 1) stat = ARG_ERR;
+ else {
+ if (!strcmp(argv[1], "create"))
+ stat = create();
+ else if (!strcmp(argv[1], "list"))
+ stat = list(false);
+ else if (!strcmp(argv[1], "listall"))
+ stat = list(true);
+ else if (!strcmp(argv[1], "view"))
+ stat = (argc > 2) ? view(atoi(argv[2])) : ARG_ERR;
+ else if (!strcmp(argv[1], "clean"))
+ stat = (argc > 2) ? clean(atoi(argv[2])) : ARG_ERR;
+ else if (!strcmp(argv[1], "cleanup"))
+ stat = cleanup();
+ else if (!strcmp(argv[1], "getlogdir"))
+ stat = getlogdir();
+ else stat = ARG_ERR;
}
- if (!mng.Init()) return 1;
- if (!strcmp(argv[1], "create")) {
- int id = mng.GenerateJobID();
- printf("%d\n", id);
- } else if (!strcmp(argv[1], "list")) {
- std::vector<singa::JobInfo> jobs;
- if (!mng.ListJobs(&jobs)) return 1;
- printf("JOB ID |NUM PROCS \n");
- printf("----------|-----------\n");
- for (singa::JobInfo job : jobs) {
- if (!job.procs) continue;
- printf("%-10d|%-10d\n", job.id, job.procs);
- }
- } else if (!strcmp(argv[1], "listall")) {
- std::vector<singa::JobInfo> jobs;
- if (!mng.ListJobs(&jobs)) return 1;
- printf("JOB ID |NUM PROCS \n");
- printf("----------|-----------\n");
- for (singa::JobInfo job : jobs) {
- printf("%-10d|%-10d\n", job.id, job.procs);
- }
- } else if (!strcmp(argv[1], "view")) {
- if (argc <= 2) {
- LOG(ERROR) << usage;
- return 1;
- }
- int id = atoi(argv[2]);
- std::vector<std::string> procs;
- if (!mng.ListJobProcs(id, &procs)) return 1;
- for (std::string s : procs) {
- printf("%s\n", s.c_str());
- }
- } else if (!strcmp(argv[1], "clean")) {
- if (argc <= 2) {
- LOG(ERROR) << usage;
- return 1;
- }
- int id = atoi(argv[2]);
- if (!mng.Clean(id)) return 1;
- } else if (!strcmp(argv[1], "cleanup")) {
- if (!mng.Cleanup()) return 1;
- } else {
- LOG(ERROR) << usage;
- return 1;
- }
-
- return 0;
+
+ if (stat == ARG_ERR) LOG(ERROR) << usage;
+ return stat;
}