* increase the etcdctl timeout in the etcd3 plugin from 5s to 10s
* reduce retry frequency on KV store (retry interval 500 ms -> 1000 ms).
Retrying too often can overload the KV store even more when it is already
timing out
* when 'watch key' returns, use the value it returned instead of
unnecessarily reading the key again; only fall back to reading the key
from the KV store if the returned value is empty
---
src/osaf/consensus/consensus.cc | 53 ++++++++++++++++---------
src/osaf/consensus/consensus.h | 7 +++-
src/osaf/consensus/plugins/etcd3.plugin | 2 +-
src/rde/rded/rde_cb.h | 1 +
src/rde/rded/rde_main.cc | 7 +++-
src/rde/rded/role.cc | 5 +++
6 files changed, 51 insertions(+), 24 deletions(-)
diff --git a/src/osaf/consensus/consensus.cc b/src/osaf/consensus/consensus.cc
index ac5573585..800b776e6 100644
--- a/src/osaf/consensus/consensus.cc
+++ b/src/osaf/consensus/consensus.cc
@@ -432,24 +432,17 @@ SaAisErrorT Consensus::WriteTakeoverResult(
return rc;
}
-SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) {
+SaAisErrorT Consensus::ParseTakeoverRequest(const std::string& request,
+ std::vector<std::string>& tokens) {
TRACE_ENTER();
- std::string request;
- SaAisErrorT rc;
-
- rc = KeyValue::Get(kTakeoverRequestKeyname, request);
- if (rc != SA_AIS_OK) {
- // it doesn't always exist, don't log an error
- TRACE("Could not read takeover request (%d)", rc);
- return SA_AIS_ERR_FAILED_OPERATION;
- }
-
if (request.empty() == true) {
// on node shutdown, this could be empty
return SA_AIS_ERR_UNAVAILABLE;
}
+ TRACE("Found '%s'", request.c_str());
+
tokens.clear();
Split(request, tokens);
if (tokens.size() != 4) {
@@ -457,12 +450,28 @@ SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) {
return SA_AIS_ERR_LIBRARY;
}
- TRACE("Found '%s'", request.c_str());
return SA_AIS_OK;
}
+SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) {
+ TRACE_ENTER();
+
+ std::string request;
+ SaAisErrorT rc;
+
+ rc = KeyValue::Get(kTakeoverRequestKeyname, request);
+ if (rc != SA_AIS_OK) {
+ // it doesn't always exist, don't log an error
+ TRACE("Could not read takeover request (%d)", rc);
+ return SA_AIS_ERR_FAILED_OPERATION;
+ }
+
+ return ParseTakeoverRequest(request, tokens);
+}
+
Consensus::TakeoverState Consensus::HandleTakeoverRequest(
- const uint64_t cluster_size) {
+ const uint64_t cluster_size,
+ const std::string& request) {
TRACE_ENTER();
if (use_consensus_ == false) {
@@ -470,15 +479,21 @@ Consensus::TakeoverState Consensus::HandleTakeoverRequest(
}
SaAisErrorT rc;
- uint32_t retries = 0;
std::vector<std::string> tokens;
- // get request from KV store
- rc = ReadTakeoverRequest(tokens);
- while (rc == SA_AIS_ERR_FAILED_OPERATION && retries < kMaxRetry) {
- ++retries;
- std::this_thread::sleep_for(kSleepInterval);
+ if (request.empty() == true) {
+ LOG_NO("Empty takeover request from watch command. Read it again.");
+ // if the plugin did not return a value with 'watch',
+ // read request from KV store
+ uint32_t retries = 0;
rc = ReadTakeoverRequest(tokens);
+ while (rc == SA_AIS_ERR_FAILED_OPERATION && retries < kMaxRetry) {
+ ++retries;
+ std::this_thread::sleep_for(kSleepInterval);
+ rc = ReadTakeoverRequest(tokens);
+ }
+ } else {
+ rc = ParseTakeoverRequest(request, tokens);
}
if (rc != SA_AIS_OK) {
diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h
index 865078349..fbcbb2037 100644
--- a/src/osaf/consensus/consensus.h
+++ b/src/osaf/consensus/consensus.h
@@ -76,14 +76,15 @@ class Consensus {
const std::string TakeoverStateStr[4] = {"UNDEFINED", "NEW", "ACCEPTED",
"REJECTED"};
- TakeoverState HandleTakeoverRequest(const uint64_t cluster_size);
+ TakeoverState HandleTakeoverRequest(const uint64_t cluster_size,
+ const std::string& request);
private:
bool use_consensus_ = false;
bool use_remote_fencing_ = false;
const std::string kTestKeyname = "opensaf_write_test";
const std::chrono::milliseconds kSleepInterval =
- std::chrono::milliseconds(500); // in ms
+ std::chrono::milliseconds(1000); // in ms
static constexpr uint32_t kLockTimeout = 0; // lock is persistent by default
static constexpr uint32_t kMaxTakeoverRetry = 20;
static constexpr uint32_t kMaxRetry = 30;
@@ -95,6 +96,8 @@ class Consensus {
const std::string& proposed_owner,
const uint64_t cluster_size);
+ SaAisErrorT ParseTakeoverRequest(const std::string& request,
+ std::vector<std::string>& tokens);
SaAisErrorT ReadTakeoverRequest(std::vector<std::string>& tokens);
SaAisErrorT WriteTakeoverResult(const std::string& current_owner,
diff --git a/src/osaf/consensus/plugins/etcd3.plugin b/src/osaf/consensus/plugins/etcd3.plugin
index 1023ea08f..75ef8bb15 100644
--- a/src/osaf/consensus/plugins/etcd3.plugin
+++ b/src/osaf/consensus/plugins/etcd3.plugin
@@ -19,7 +19,7 @@
readonly keyname="opensaf_consensus_lock"
readonly directory="/opensaf/"
readonly etcd_options=""
-readonly etcd_timeout="5s"
+readonly etcd_timeout="10s"
export ETCDCTL_API=3
diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
index 877687341..d3f5a24a5 100644
--- a/src/rde/rded/rde_cb.h
+++ b/src/rde/rded/rde_cb.h
@@ -68,6 +68,7 @@ struct rde_msg {
NODE_ID fr_node_id;
union {
rde_peer_info peer_info;
+ char* takeover_request = nullptr;
} info;
};
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index c59aa4536..e5813e414 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -161,13 +161,16 @@ static void handle_mbx_event() {
rde_cb->monitor_takeover_req_thread_running = false;
if (role->role() == PCS_RDA_ACTIVE) {
- LOG_NO("Received takeover request. Our network size is %zu",
+ LOG_NO("Received takeover request '%s'. Our network size is %zu",
+ msg->info.takeover_request,
rde_cb->cluster_members.size());
Consensus consensus_service;
Consensus::TakeoverState state =
consensus_service.HandleTakeoverRequest(
- rde_cb->cluster_members.size());
+ rde_cb->cluster_members.size(),
+ msg->info.takeover_request);
+ delete[] msg->info.takeover_request;
if (state == Consensus::TakeoverState::ACCEPTED) {
LOG_NO("Accepted takeover request");
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index a03372413..0567fdfcf 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -54,6 +54,11 @@ void Role::MonitorCallback(const std::string& key, const std::string& new_value,
// don't send this to the main thread straight away, as it will
// need some time to process topology changes.
msg->type = RDE_MSG_TAKEOVER_REQUEST_CALLBACK;
+ size_t len = new_value.length() + 1;
+ msg->info.takeover_request = new char[len];
+ strncpy(msg->info.takeover_request, new_value.c_str(), len);
+ LOG_NO("Sending takeover request '%s' to main thread",
+ msg->info.takeover_request);
std::this_thread::sleep_for(std::chrono::seconds(4));
} else {
msg->type = RDE_MSG_NEW_ACTIVE_CALLBACK;
--
2.17.1
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel