rded should not automatically include itself in the cluster member list.
Instead, it should rely solely on AMFND service up events to populate the
list, so that the member count is consistent across nodes.
Also adjust some split-brain prevention related values. More time
is required to ensure we have an accurate view of cluster
member status, especially when MDS is run over DTM/TCP.
---
src/fm/fmd/fm_rda.cc | 2 +-
src/osaf/consensus/consensus.h | 2 +-
src/rde/rded/rde_main.cc | 4 ----
src/rde/rded/role.cc | 2 +-
4 files changed, 3 insertions(+), 7 deletions(-)
diff --git a/src/fm/fmd/fm_rda.cc b/src/fm/fmd/fm_rda.cc
index af337f868..2a9f1664f 100644
--- a/src/fm/fmd/fm_rda.cc
+++ b/src/fm/fmd/fm_rda.cc
@@ -92,7 +92,7 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb, PCS_RDA_ROLE role) {
// be processing MDS down events and updating cluster_size concurrently.
// We need cluster_size to be as accurate as possible, without waiting
// too long for node down events.
- std::this_thread::sleep_for(std::chrono::seconds(3));
+ std::this_thread::sleep_for(std::chrono::seconds(4));
rc = consensus_service.PromoteThisNode(true, fm_cb->cluster_size);
if (rc != SA_AIS_OK && rc != SA_AIS_ERR_EXIST) {
diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h
index f6493bedc..865078349 100644
--- a/src/osaf/consensus/consensus.h
+++ b/src/osaf/consensus/consensus.h
@@ -86,7 +86,7 @@ class Consensus {
std::chrono::milliseconds(500); // in ms
static constexpr uint32_t kLockTimeout = 0; // lock is persistent by default
static constexpr uint32_t kMaxTakeoverRetry = 20;
- static constexpr uint32_t kMaxRetry = 60;
+ static constexpr uint32_t kMaxRetry = 30;
static constexpr uint32_t kTakeoverValidTime = 15; // in seconds
void CheckForExistingTakeoverRequest();
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index b395312d6..c5b4b8283 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -260,10 +260,6 @@ static int initialize_rde() {
goto init_failed;
}
- // normally populated through AMFND svc up, but always
- // insert ourselves into the set on startup.
- rde_cb->cluster_members.insert(own_node_id);
-
rc = NCSCC_RC_SUCCESS;
init_failed:
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index 1fe0febe3..1b5a6ae89 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -53,7 +53,7 @@ void Role::MonitorCallback(const std::string& key, const std::string& new_value,
// don't send this to the main thread straight away, as it will
// need some time to process topology changes.
msg->type = RDE_MSG_TAKEOVER_REQUEST_CALLBACK;
- std::this_thread::sleep_for(std::chrono::seconds(2));
+ std::this_thread::sleep_for(std::chrono::seconds(4));
} else {
msg->type = RDE_MSG_NEW_ACTIVE_CALLBACK;
}
--
2.14.1
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel