Repository: incubator-singa Updated Branches: refs/heads/master aad22fcd5 -> 71d10fbe5
SINGA-79 Fix bug in singatool that cannot parse -conf flag singa-env.sh - change commandline arg from -confdir=XXX to -confdir XXX tool.cc - parse -confdir flag to read corresponding singa configuration cluster_rt.h/cc - add detailed documentation for JobManager - add checks for zk related operations if zk handler is not initialized. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/59001114 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/59001114 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/59001114 Branch: refs/heads/master Commit: 59001114a6290b3d05b77bcf4bd438396fb0702a Parents: aad22fc Author: wang sheng <[email protected]> Authored: Fri Oct 2 13:35:41 2015 +0800 Committer: wang sheng <[email protected]> Committed: Fri Oct 2 14:44:18 2015 +0800 ---------------------------------------------------------------------- bin/singa-env.sh | 2 +- conf/singa.conf | 2 ++ include/utils/cluster_rt.h | 13 ++++++++++++- src/utils/cluster_rt.cc | 14 +++++++++++--- src/utils/tool.cc | 15 ++++++++++++--- 5 files changed, 38 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/59001114/bin/singa-env.sh ---------------------------------------------------------------------- diff --git a/bin/singa-env.sh b/bin/singa-env.sh index 98a0abc..0a3db9e 100755 --- a/bin/singa-env.sh +++ b/bin/singa-env.sh @@ -51,7 +51,7 @@ fi # set SINGA_LOG if [ -z $SINGA_LOG ]; then # add -confdir arg, so no need to run under SINGA_HOME - SINGA_LOG=`"$SINGA_HOME"/singatool getlogdir -confdir="$SINGA_CONF"` + SINGA_LOG=`"$SINGA_HOME"/singatool getlogdir -confdir "$SINGA_CONF"` [ $? 
== 0 ] || exit 1 fi http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/59001114/conf/singa.conf ---------------------------------------------------------------------- diff --git a/conf/singa.conf b/conf/singa.conf index fad37d5..20cff98 100644 --- a/conf/singa.conf +++ b/conf/singa.conf @@ -1,4 +1,6 @@ # point to your active zookeeper service +# this is comma separated host:port pairs, each corresponding to a zk server +# e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002" zookeeper_host: "localhost:2181" # set if you want to change log directory http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/59001114/include/utils/cluster_rt.h ---------------------------------------------------------------------- diff --git a/include/utils/cluster_rt.h b/include/utils/cluster_rt.h index bdfa8fd..5de6c16 100644 --- a/include/utils/cluster_rt.h +++ b/include/utils/cluster_rt.h @@ -151,16 +151,27 @@ class ClusterRuntime { class JobManager { public: + // host is comma separated host:port pairs, each corresponding to a zk server. + // e.g. 
"127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002" explicit JobManager(const std::string& host); JobManager(const std::string& host, int timeout); + // NOTICE: Init must be called once, before start to use other functions bool Init(); + // generate a unique job id bool GenerateJobID(int* id); - bool GenerateHostList(const char* job_file, std::vector<std::string>* list); + // generate a list of hosts for a job conf + bool GenerateHostList(const char* host_file, const char* job_file, + std::vector<std::string>* list); + // list all jobs recorded in zk bool ListJobs(std::vector<JobInfo>* jobs); + // list running processes for a job bool ListJobProcs(int job, std::vector<std::string>* procs); + // remove a job path in zk bool Remove(int job); + // remove all job paths in zk bool RemoveAllJobs(); + // remove all singa related paths in zk bool CleanUp(); private: http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/59001114/src/utils/cluster_rt.cc ---------------------------------------------------------------------- diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc index e51ac97..493e1e2 100644 --- a/src/utils/cluster_rt.cc +++ b/src/utils/cluster_rt.cc @@ -83,6 +83,7 @@ bool ZKService::Init(const string& host, int timeout) { bool ZKService::CreateNode(const char* path, const char* val, int flag, char* output) { + CHECK(zkhandle_) << "zk handler not initialized"; char buf[kZKBufSize]; int ret = 0; // send the zk request @@ -126,6 +127,7 @@ bool ZKService::CreateNode(const char* path, const char* val, int flag, } bool ZKService::DeleteNode(const char* path) { + CHECK(zkhandle_) << "zk handler not initialized"; int ret = zoo_delete(zkhandle_, path, -1); if (ret == ZOK) { LOG(INFO) << "deleted zookeeper node " << path; @@ -140,6 +142,7 @@ bool ZKService::DeleteNode(const char* path) { } bool ZKService::Exist(const char* path) { + CHECK(zkhandle_) << "zk handler not initialized"; struct Stat stat; int ret = zoo_exists(zkhandle_, path, 0, &stat); if (ret == 
ZOK) return true; @@ -149,6 +152,7 @@ bool ZKService::Exist(const char* path) { } bool ZKService::UpdateNode(const char* path, const char* val) { + CHECK(zkhandle_) << "zk handler not initialized"; // set version = -1, do not check content version int ret = zoo_set(zkhandle_, path, val, strlen(val), -1); if (ret == ZOK) { @@ -163,6 +167,7 @@ bool ZKService::UpdateNode(const char* path, const char* val) { } bool ZKService::GetNode(const char* path, char* output) { + CHECK(zkhandle_) << "zk handler not initialized"; struct Stat stat; int val_len = kZKBufSize; int ret = zoo_get(zkhandle_, path, 0, output, &val_len, &stat); @@ -179,6 +184,7 @@ bool ZKService::GetNode(const char* path, char* output) { } bool ZKService::GetChild(const char* path, vector<string>* vt) { + CHECK(zkhandle_) << "zk handler not initialized"; struct String_vector child; int ret = zoo_get_children(zkhandle_, path, 0, &child); if (ret == ZOK) { @@ -193,6 +199,7 @@ bool ZKService::GetChild(const char* path, vector<string>* vt) { bool ZKService::WGetChild(const char* path, vector<string>* vt, RTCallback *cb) { + CHECK(zkhandle_) << "zk handler not initialized"; struct String_vector child; int ret = zoo_wget_children(zkhandle_, path, ChildChanges, cb, &child); if (ret == ZOK) { @@ -358,7 +365,8 @@ bool JobManager::GenerateJobID(int* id) { return true; } -bool JobManager::GenerateHostList(const char* job_file, vector<string>* list) { +bool JobManager::GenerateHostList(const char* host_file, const char* job_file, + vector<string>* list) { // compute required #process from job conf ClusterProto cluster; google::protobuf::TextFormat::ParseFromString(ExtractClusterConf(job_file), @@ -373,9 +381,9 @@ bool JobManager::GenerateHostList(const char* job_file, vector<string>* list) { else nprocs = std::max(nworker_procs, nserver_procs); // get available host list from global conf - std::ifstream hostfile("conf/hostfile"); + std::ifstream hostfile(host_file); if (!hostfile.is_open()) { - LOG(FATAL) << "Cannot 
open file: " << "conf/hostfile"; + LOG(FATAL) << "Cannot open file: " << host_file; } vector<string> hosts; string host; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/59001114/src/utils/tool.cc ---------------------------------------------------------------------- diff --git a/src/utils/tool.cc b/src/utils/tool.cc index 435129c..3a6563c 100644 --- a/src/utils/tool.cc +++ b/src/utils/tool.cc @@ -27,6 +27,7 @@ #include "utils/cluster_rt.h" #include "utils/common.h" +std::string conf_dir; singa::SingaProto global; const int SUCCESS = 0; const int ARG_ERR = 1; @@ -55,7 +56,8 @@ int genhost(char* job_conf) { singa::JobManager mngr(global.zookeeper_host()); if (!mngr.Init()) return RUN_ERR; std::vector<std::string> list; - if (!mngr.GenerateHostList(job_conf, &list)) return RUN_ERR; + if (!mngr.GenerateHostList((conf_dir+"/hostfile").c_str(), job_conf, &list)) + return RUN_ERR; // output selected hosts for (std::string host : list) printf("%s\n", host.c_str()); @@ -123,12 +125,19 @@ int main(int argc, char **argv) { " view <job id> : view procs of a singa job\n" " remove <job id> : remove a job path in zookeeper\n" " removeall : remova all job paths in zookeeper\n" - " cleanup : clean all singa data in zookeeper\n"; + " cleanup : clean all singa data in zookeeper\n" + "[optional arguments] NOTICE: must put at end of a command\n" + " -confdir <dir> : path to singa global conf dir"; + // set logging level to ERROR and log to STDERR only google::LogToStderr(); google::SetStderrLogging(google::ERROR); google::InitGoogleLogging(argv[0]); - singa::ReadProtoFromTextFile("conf/singa.conf", &global); + // parse -confdir argument + int arg_pos = singa::ArgPos(argc, argv, "-confdir"); + conf_dir = arg_pos == -1 ? "conf" : argv[arg_pos+1]; + if (arg_pos != -1) argc -= 2; + singa::ReadProtoFromTextFile((conf_dir+"/singa.conf").c_str(), &global); // stat code: ARG_ERR for wrong argument, RUN_ERR for runtime error int stat = (argc <= 1) ? ARG_ERR : SUCCESS;
