Add retry for a worker connecting to ZooKeeper (zk) when its subscribed server group is not up yet
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/96121bae Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/96121bae Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/96121bae Branch: refs/heads/master Commit: 96121bae15b4fe2dbd8541f5daf47ab91d080bb7 Parents: cd9fc79 Author: wangsheng <[email protected]> Authored: Tue May 26 16:52:39 2015 +0800 Committer: wangsheng <[email protected]> Committed: Tue May 26 16:52:39 2015 +0800 ---------------------------------------------------------------------- Makefile.example | 2 +- include/utils/cluster.h | 13 ------------- include/utils/cluster_rt.h | 2 ++ src/utils/cluster_rt.cc | 33 +++++++++++++++++++++------------ 4 files changed, 24 insertions(+), 26 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/Makefile.example ---------------------------------------------------------------------- diff --git a/Makefile.example b/Makefile.example index 582e8d7..6d8d83a 100644 --- a/Makefile.example +++ b/Makefile.example @@ -51,7 +51,7 @@ OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) ) .PHONY: singa test singa: $(PROTO_OBJS) $(SINGA_OBJS) - $(CXX) $(SINGA_OBJS) src/main.cc -o $(BUILD_DIR)/singa $(CXXFLAGS) $(LDFLAGS) + $(CXX) $(SINGA_OBJS) src/main.cc -o singa $(CXXFLAGS) $(LDFLAGS) @echo loader: proto $(LOADER_OBJS) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/include/utils/cluster.h ---------------------------------------------------------------------- diff --git a/include/utils/cluster.h b/include/utils/cluster.h index 563045d..d7ac365 100644 --- a/include/utils/cluster.h +++ b/include/utils/cluster.h @@ -109,19 +109,6 @@ class Cluster { } */ - //ClusterRuntime functions - bool server_watch(int gid, int sid) const { - return false; - } - - bool worker_join_sgroup(int gid, int wid, int 
server_group) const { - return false; - } - - bool worker_leave_sgroup(int gid, int wid, int s_group) const { - return false; - } - shared_ptr<ClusterRuntime> runtime() const { return cluster_rt_; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/include/utils/cluster_rt.h ---------------------------------------------------------------------- diff --git a/include/utils/cluster_rt.h b/include/utils/cluster_rt.h index 1430119..54a13c5 100644 --- a/include/utils/cluster_rt.h +++ b/include/utils/cluster_rt.h @@ -74,6 +74,8 @@ class ZKClusterRT : public ClusterRuntime{ vector<RTCallback *> cb_vec_; const int MAX_BUF_LEN = 50; + const int RETRY_NUM = 10; + const int SLEEP_SEC = 1; }; } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/96121bae/src/utils/cluster_rt.cc ---------------------------------------------------------------------- diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc index 6a12ca9..b97fadc 100644 --- a/src/utils/cluster_rt.cc +++ b/src/utils/cluster_rt.cc @@ -81,21 +81,30 @@ bool ZKClusterRT::wJoinSGroup(int gid, int wid, int s_group){ string path = getSGroupPath(s_group) + getWorkerPath(gid, wid); char buf[MAX_BUF_LEN]; - int ret = zoo_create(zkhandle_, path.c_str(), NULL, -1, &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, buf, MAX_BUF_LEN); - if (ret == ZOK){ - LOG(INFO) << "zookeeper node " << buf << " created"; - return true; - } - else if (ret == ZNODEEXISTS){ - LOG(WARNING) << "zookeeper node " << path << " already exist"; - return true; - } - else if (ret == ZNONODE){ - LOG(ERROR) << "zookeeper parent node " << getSGroupPath(s_group) << " not exist"; + //try to create a file under the server group path + for (int i = 0; i < RETRY_NUM; ++i){ + //send the zk request + int ret = zoo_create(zkhandle_, path.c_str(), NULL, -1, &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, buf, MAX_BUF_LEN); + + if (ret == ZOK){ + LOG(INFO) << "zookeeper node " << buf << " created"; + return true; + } + else if (ret == 
ZNODEEXISTS){ + LOG(WARNING) << "zookeeper node " << path << " already exist"; + return true; + } + //the parent node is not on, need to wait + else if (ret == ZNONODE){ + LOG(WARNING) << "zookeeper parent node " << getSGroupPath(s_group) << " not exist, retry later"; + sleep(SLEEP_SEC); + } + + LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_create)"; return false; } - LOG(ERROR) << "Unhandled ZK error code: " << ret << " (zoo_create)"; + LOG(ERROR) << "zookeeper parent node " << getSGroupPath(s_group) << "still not exist after " << RETRY_NUM << " tries"; return false; }
