Repository: incubator-singa
Updated Branches:
  refs/heads/master 921f9277f -> 6bcaaaa4d
fix bugs for early server termination -- clean singa meta in zookeeper before running a singa job

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6bcaaaa4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6bcaaaa4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6bcaaaa4

Branch: refs/heads/master
Commit: 6bcaaaa4dc873bd54284e69720f358b540b2d370
Parents: 921f927
Author: wang sheng <[email protected]>
Authored: Thu May 28 15:04:08 2015 +0800
Committer: wang sheng <[email protected]>
Committed: Thu May 28 15:04:08 2015 +0800

----------------------------------------------------------------------
 README.md               |  2 +-
 bin/singa-cleanup.sh    |  8 +++++++-
 bin/singa-run.sh        | 21 +++++++++++----------
 src/utils/cluster_rt.cc | 33 +++++++++++++++++++++++----------
 4 files changed, 42 insertions(+), 22 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 66a2964..5effdb8 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ SINGA is developed and tested on Linux platforms with the following external lib
   * czmq version >= 3
-  * zookeeper version >= 3.4.6
+  * zookeeper version 3.4.6
 Tips: For libraries like openblas, opencv, older versions may also work, because we do not use any newly added features.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/bin/singa-cleanup.sh ---------------------------------------------------------------------- diff --git a/bin/singa-cleanup.sh b/bin/singa-cleanup.sh index c38d1e6..f94c9db 100755 --- a/bin/singa-cleanup.sh +++ b/bin/singa-cleanup.sh @@ -33,5 +33,11 @@ usage="Usage: singa-cleanup.sh" BIN=`dirname "${BASH_SOURCE-$0}"` BIN=`cd "$BIN">/dev/null; pwd` BASE=`cd "$BIN/..">/dev/null; pwd` +ZKDATADIR="/tmp/zookeeper" + +. $BIN/zk-service.sh stop 2>/dev/null + +echo cleanning data in zookeeper... +#remove zk data +rm -r $ZKDATADIR -$BIN/zk-service.sh stop http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/bin/singa-run.sh ---------------------------------------------------------------------- diff --git a/bin/singa-run.sh b/bin/singa-run.sh index 3ee50a3..c911ec3 100755 --- a/bin/singa-run.sh +++ b/bin/singa-run.sh @@ -34,22 +34,23 @@ BIN=`dirname "${BASH_SOURCE-$0}"` BIN=`cd "$BIN">/dev/null; pwd` BASE=`cd "$BIN/..">/dev/null; pwd` -#get argument -cmd=$1 - cd $BASE -$BIN/zk-service.sh start +#cleanup singa data +. $BIN/singa-cleanup.sh + +#start zookeeper +. $BIN/zk-service.sh start 2>/dev/null #wait for zk service to be up sleep 3 +#run singa +cmd="./singa "$@ echo starting singa ... +echo executing: $cmd +exec $cmd -echo "./singa" $@ -#. ./singa $@ -. ./singa $@ - +#stop zookeeper echo stopping singa ... - -$BIN/zk-service.sh stop +. 
$BIN/zk-service.sh stop 2>/dev/null http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bcaaaa4/src/utils/cluster_rt.cc ---------------------------------------------------------------------- diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc index fe9850f..b60b334 100644 --- a/src/utils/cluster_rt.cc +++ b/src/utils/cluster_rt.cc @@ -157,17 +157,30 @@ void ZKClusterRT::childChanges(zhandle_t *zh, int type, int state, const char *p RTCallback *cb = (RTCallback *)watcherCtx; if (cb->fn == nullptr) return; - struct String_vector child; - //check the child list and put another watcher - int ret = zoo_wget_children(zh, path, childChanges, watcherCtx, &child); - LOG(INFO) << "ret = " << ret; - if (ret == ZOK){ - LOG(INFO) << "child.count = " << child.count; - if (child.count == 0){ - //all workers leave, we do callback now - (*cb->fn)(cb->ctx); - cb->fn = nullptr; + if (type == ZOO_CHILD_EVENT){ + struct String_vector child; + //check the child list and put another watcher + int ret = zoo_wget_children(zh, path, childChanges, watcherCtx, &child); + LOG(INFO) << "ret = " << ret; + if (ret == ZOK){ + LOG(INFO) << "child.count = " << child.count; + if (child.count == 0){ + //LOG(ERROR) << "do call back"; + //LOG(ERROR) << "type = " << type; + //LOG(ERROR) << "state = " << state; + //LOG(ERROR) << "path = " << path; + + //all workers leave, we do callback now + (*cb->fn)(cb->ctx); + cb->fn = nullptr; + } } + else{ + LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_wget_children)"; + } + } + else{ + LOG(ERROR) << "Unhandled callback type code: "<< type; } }
