Hi I will push this next Monday if no one has comments.
Thanks Gary On 25/6/18, 1:10 pm, "Gary Lee" <gary....@dektech.com.au> wrote: * increase timeout in etcd3 plugin * reduce retry frequency on KV store. This can overload the KV store even more when the KV store is already timing out * when 'watch key' returns, try to use the value returned instead of unnecessarily reading it again --- src/osaf/consensus/consensus.cc | 53 ++++++++++++++++--------- src/osaf/consensus/consensus.h | 7 +++- src/osaf/consensus/plugins/etcd3.plugin | 2 +- src/rde/rded/rde_cb.h | 1 + src/rde/rded/rde_main.cc | 7 +++- src/rde/rded/role.cc | 5 +++ 6 files changed, 51 insertions(+), 24 deletions(-) diff --git a/src/osaf/consensus/consensus.cc b/src/osaf/consensus/consensus.cc index ac5573585..800b776e6 100644 --- a/src/osaf/consensus/consensus.cc +++ b/src/osaf/consensus/consensus.cc @@ -432,24 +432,17 @@ SaAisErrorT Consensus::WriteTakeoverResult( return rc; } -SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) { +SaAisErrorT Consensus::ParseTakeoverRequest(const std::string& request, + std::vector<std::string>& tokens) { TRACE_ENTER(); - std::string request; - SaAisErrorT rc; - - rc = KeyValue::Get(kTakeoverRequestKeyname, request); - if (rc != SA_AIS_OK) { - // it doesn't always exist, don't log an error - TRACE("Could not read takeover request (%d)", rc); - return SA_AIS_ERR_FAILED_OPERATION; - } - if (request.empty() == true) { // on node shutdown, this could be empty return SA_AIS_ERR_UNAVAILABLE; } + TRACE("Found '%s'", request.c_str()); + tokens.clear(); Split(request, tokens); if (tokens.size() != 4) { @@ -457,12 +450,28 @@ SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) { return SA_AIS_ERR_LIBRARY; } - TRACE("Found '%s'", request.c_str()); return SA_AIS_OK; } +SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) { + TRACE_ENTER(); + + std::string request; + SaAisErrorT rc; + + rc = KeyValue::Get(kTakeoverRequestKeyname, request); + if (rc != SA_AIS_OK) { + // it doesn't always exist, don't log an error + TRACE("Could not read takeover request (%d)", rc); + return SA_AIS_ERR_FAILED_OPERATION; + } + + return ParseTakeoverRequest(request, tokens); +} + Consensus::TakeoverState Consensus::HandleTakeoverRequest( - const uint64_t cluster_size) { + const uint64_t cluster_size, + const std::string& request) { TRACE_ENTER(); if (use_consensus_ == false) { @@ -470,15 +479,21 @@ Consensus::TakeoverState Consensus::HandleTakeoverRequest( } SaAisErrorT rc; - uint32_t retries = 0; std::vector<std::string> tokens; - // get request from KV store - rc = ReadTakeoverRequest(tokens); - while (rc == SA_AIS_ERR_FAILED_OPERATION && retries < kMaxRetry) { - ++retries; - std::this_thread::sleep_for(kSleepInterval); + if (request.empty() == true) { + LOG_NO("Empty takeover request from watch command. Read it again."); + // if the plugin did not return a value with 'watch', + // read request from KV store + uint32_t retries = 0; rc = ReadTakeoverRequest(tokens); + while (rc == SA_AIS_ERR_FAILED_OPERATION && retries < kMaxRetry) { + ++retries; + std::this_thread::sleep_for(kSleepInterval); + rc = ReadTakeoverRequest(tokens); + } + } else { + rc = ParseTakeoverRequest(request, tokens); } if (rc != SA_AIS_OK) { diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h index 865078349..fbcbb2037 100644 --- a/src/osaf/consensus/consensus.h +++ b/src/osaf/consensus/consensus.h @@ -76,14 +76,15 @@ class Consensus { const std::string TakeoverStateStr[4] = {"UNDEFINED", "NEW", "ACCEPTED", "REJECTED"}; - TakeoverState HandleTakeoverRequest(const uint64_t cluster_size); + TakeoverState HandleTakeoverRequest(const uint64_t cluster_size, + const std::string& request); private: bool use_consensus_ = false; bool use_remote_fencing_ = false; const std::string kTestKeyname = "opensaf_write_test"; const std::chrono::milliseconds kSleepInterval = - std::chrono::milliseconds(500); // in ms + std::chrono::milliseconds(1000); // in ms static constexpr uint32_t kLockTimeout = 0; // lock is persistent by default static constexpr uint32_t kMaxTakeoverRetry = 20; static constexpr uint32_t kMaxRetry = 30; @@ -95,6 +96,8 @@ class Consensus { const std::string& proposed_owner, const uint64_t cluster_size); + SaAisErrorT ParseTakeoverRequest(const std::string& request, + std::vector<std::string>& tokens); SaAisErrorT ReadTakeoverRequest(std::vector<std::string>& tokens); SaAisErrorT WriteTakeoverResult(const std::string& current_owner, diff --git a/src/osaf/consensus/plugins/etcd3.plugin b/src/osaf/consensus/plugins/etcd3.plugin index 1023ea08f..75ef8bb15 100644 --- a/src/osaf/consensus/plugins/etcd3.plugin +++ b/src/osaf/consensus/plugins/etcd3.plugin @@ -19,7 +19,7 @@ readonly keyname="opensaf_consensus_lock" readonly directory="/opensaf/" readonly etcd_options="" -readonly etcd_timeout="5s" +readonly etcd_timeout="10s" export ETCDCTL_API=3 diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h index 877687341..d3f5a24a5 100644 --- a/src/rde/rded/rde_cb.h +++ b/src/rde/rded/rde_cb.h @@ -68,6 +68,7 @@ struct rde_msg { NODE_ID fr_node_id; union { rde_peer_info peer_info; + char* takeover_request = nullptr; } info; }; diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc index c59aa4536..e5813e414 100644 --- a/src/rde/rded/rde_main.cc +++ b/src/rde/rded/rde_main.cc @@ -161,13 +161,16 @@ static void handle_mbx_event() { rde_cb->monitor_takeover_req_thread_running = false; if (role->role() == PCS_RDA_ACTIVE) { - LOG_NO("Received takeover request. Our network size is %zu", + LOG_NO("Received takeover request '%s'. Our network size is %zu", + msg->info.takeover_request, rde_cb->cluster_members.size()); Consensus consensus_service; Consensus::TakeoverState state = consensus_service.HandleTakeoverRequest( - rde_cb->cluster_members.size()); + rde_cb->cluster_members.size(), + msg->info.takeover_request); + delete[] msg->info.takeover_request; if (state == Consensus::TakeoverState::ACCEPTED) { LOG_NO("Accepted takeover request"); diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc index a03372413..0567fdfcf 100644 --- a/src/rde/rded/role.cc +++ b/src/rde/rded/role.cc @@ -54,6 +54,11 @@ void Role::MonitorCallback(const std::string& key, const std::string& new_value, // don't send this to the main thread straight away, as it will // need some time to process topology changes. msg->type = RDE_MSG_TAKEOVER_REQUEST_CALLBACK; + size_t len = new_value.length() + 1; + msg->info.takeover_request = new char[len]; + strncpy(msg->info.takeover_request, new_value.c_str(), len); + LOG_NO("Sending takeover request '%s' to main thread", + msg->info.takeover_request); std::this_thread::sleep_for(std::chrono::seconds(4)); } else { msg->type = RDE_MSG_NEW_ACTIVE_CALLBACK; -- 2.17.1 ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel