SINGA-8 Implement distributed Hogwild handle zookeeper disconnection. change zookeeper log level to ERROR.
close #14 Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c51f9264 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c51f9264 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c51f9264 Branch: refs/heads/master Commit: c51f9264bb26f22cc7e49ee85ea2fc30c322cc9f Parents: 4956d6a Author: wang sheng <[email protected]> Authored: Thu Jun 25 21:40:38 2015 +0800 Committer: wang sheng <[email protected]> Committed: Thu Jun 25 21:40:38 2015 +0800 ---------------------------------------------------------------------- src/utils/cluster_rt.cc | 13 +++++++++---- src/utils/param.cc | 9 ++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c51f9264/src/utils/cluster_rt.cc ---------------------------------------------------------------------- diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc index 61911fd..6143567 100644 --- a/src/utils/cluster_rt.cc +++ b/src/utils/cluster_rt.cc @@ -28,7 +28,7 @@ ZKClusterRT::~ZKClusterRT() { char zk_cxt[] = "ZKClusterRT"; bool ZKClusterRT::Init() { - zoo_set_debug_level(ZOO_LOG_LEVEL_WARN); + zoo_set_debug_level(ZOO_LOG_LEVEL_ERROR); zkhandle_ = zookeeper_init(host_.c_str(), WatcherGlobal, timeout_, 0, static_cast<void *>(zk_cxt), 0); if (zkhandle_ == NULL) { @@ -176,9 +176,14 @@ bool ZKClusterRT::CreateZKNode(const char* path, const char* val, int flag, for (int i = 0; i < kNumRetry; ++i) { ret = zoo_create(zkhandle_, path, val, val == nullptr ? 
-1 : strlen(val), &ZOO_OPEN_ACL_UNSAFE, flag, buf, kMaxBufLen); - if (ret != ZNONODE) break; - LOG(WARNING) << "zookeeper parent node of " << path - << " not exist, retry later"; + if (ret == ZNONODE) { + LOG(WARNING) << "zookeeper parent node of " << path + << " not exist, retry later"; + } else if (ret == ZCONNECTIONLOSS) { + LOG(WARNING) << "zookeeper disconnected, retry later"; + } else { + break; + } sleep(kSleepSec); } // copy the node name ot output http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c51f9264/src/utils/param.cc ---------------------------------------------------------------------- diff --git a/src/utils/param.cc b/src/utils/param.cc index 1e05ab9..07ad8ce 100644 --- a/src/utils/param.cc +++ b/src/utils/param.cc @@ -168,7 +168,8 @@ Msg* Param::HandlePutMsg(Msg** msg){ } Msg* Param::HandleGetMsg(Msg** msg){ - char copy; sscanf(static_cast<char*>((*msg)->frame_data()), " %d ", &copy); + int copy; + sscanf(static_cast<char*>((*msg)->frame_data()), " %d ", &copy); (*msg)->next_frame(); if(copy) (*msg)->add_frame(mutable_cpu_data(), sizeof(float)*size()); @@ -179,7 +180,8 @@ Msg* Param::HandleGetMsg(Msg** msg){ } int Param::ParseUpdateMsg(Msg** msg){ - char copy; sscanf(static_cast<char*>((*msg)->frame_data()), " %d ", &copy); + int copy; + sscanf(static_cast<char*>((*msg)->frame_data()), " %d ", &copy); (*msg)->next_frame(); if(copy){ LOG(ERROR)<<"Copy in parse update"; @@ -231,7 +233,8 @@ int Param::ParseUpdateResponseMsg(Msg **msg, int slice_idx){ } void Param::ParseResponseMsg(Msg** msg, int slice_idx){ - char copy; sscanf(static_cast<char*>((*msg)->frame_data()), " %d ", &copy); + int copy; + sscanf(static_cast<char*>((*msg)->frame_data()), " %d ", &copy); (*msg)->next_frame(); if(copy){ CHECK((*msg)->frame_size());
