This ticket revisits the wait for peer info and
fixes the problem of out-of-order peer_up and peer info
messages in commit d1593b03b3c9bec292b14dde65264c261760bf46
---
src/rde/rded/rde_main.cc | 1 +
src/rde/rded/role.cc | 63 +++++++++++++++++++++++++++++++++++++++-
src/rde/rded/role.h | 7 +++++
3 files changed, 70 insertions(+), 1 deletion(-)
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index 8ed6b046e..33dd645e2 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -125,6 +125,7 @@ static void handle_mbx_event() {
}
case RDE_MSG_PEER_DOWN:
LOG_NO("Peer down on node 0x%x", msg->fr_node_id);
+ role->RemovePeer(msg->fr_node_id);
break;
case RDE_MSG_NEW_ACTIVE_CALLBACK: {
const std::string my_node = base::Conf::NodeName();
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index 3732be449..344702e63 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -196,9 +196,13 @@ Role::Role(NODE_ID own_node_id)
discover_peer_timeout_{base::GetEnv("RDE_DISCOVER_PEER_TIMEOUT",
kDefaultDiscoverPeerTimeout)},
pre_active_script_timeout_{base::GetEnv(
- "RDE_PRE_ACTIVE_SCRIPT_TIMEOUT", kDefaultPreActiveScriptTimeout)} {}
+ "RDE_PRE_ACTIVE_SCRIPT_TIMEOUT", kDefaultPreActiveScriptTimeout)},
+ received_peer_info_{true},
+ peer_info_wait_time_{},
+ peer_info_wait_timeout_ {kDefaultWaitPeerInfoTimeout} {}
timespec* Role::Poll(timespec* ts) {
+ TRACE_ENTER();
timespec* timeout = nullptr;
if (role_ == PCS_RDA_UNDEFINED) {
timespec now = base::ReadMonotonicClock();
@@ -238,6 +242,25 @@ timespec* Role::Poll(timespec* ts) {
cb->state_refresh_thread_started = true;
std::thread(&Role::RefreshConsensusState, this, cb).detach();
}
+ if (consensus_service.IsEnabled() == false) {
+ // We are already ACTIVE and have just discovered a new node,
+ // which caused election_end_time_ to be reset
+ if (received_peer_info_ == false) {
+ timespec now = base::ReadMonotonicClock();
+ if (peer_info_wait_time_ >= now) {
+ *ts = peer_info_wait_time_ - now;
+ timeout = ts;
+ } else {
+ // Timed out without having received peer info.
+ // The peer RDE could be ACTIVE,
+ // so self-fence to avoid the risk of split-brain
+ LOG_ER("Discovery peer up without peer info. Risk in split-brain,"
+ "rebooting this node");
+ opensaf_quick_reboot("Probable split-brain due to "
+ "unknown RDE peer info");
+ }
+ }
+ }
}
}
return timeout;
@@ -251,12 +274,25 @@ void Role::ExecutePreActiveScript() {
}
void Role::AddPeer(NODE_ID node_id) {
+ TRACE_ENTER();
auto result = known_nodes_.insert(node_id);
if (result.second) {
ResetElectionTimer();
+ if (role_ == PCS_RDA_ACTIVE) {
+ ResetPeerInfoWaitTimer();
+ received_peer_info_ = false;
+ }
}
}
+void Role::RemovePeer(NODE_ID node_id) {
+ TRACE_ENTER();
+ if (received_peer_info_ == false && role_ != PCS_RDA_ACTIVE) {
+ StopPeerInfoWaitTimer();
+ }
+ known_nodes_.erase(node_id);
+}
+
// call from main thread only
bool Role::IsCandidate() {
TRACE_ENTER();
@@ -330,10 +366,24 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
}
void Role::ResetElectionTimer() {
+ TRACE_ENTER();
election_end_time_ = base::ReadMonotonicClock() +
base::MillisToTimespec(discover_peer_timeout_);
}
+void Role::ResetPeerInfoWaitTimer() {
+ TRACE_ENTER();
+ LOG_NO("Start/restart waiting peer info timer");
+ peer_info_wait_time_ = base::ReadMonotonicClock() +
+ base::MillisToTimespec(peer_info_wait_timeout_);
+}
+
+void Role::StopPeerInfoWaitTimer() {
+ TRACE_ENTER();
+ // Turn off peer_info_timer
+ received_peer_info_ = true;
+}
+
uint32_t Role::UpdateMdsRegistration(PCS_RDA_ROLE new_role,
PCS_RDA_ROLE old_role) {
uint32_t rc = NCSCC_RC_SUCCESS;
@@ -357,6 +407,7 @@ uint32_t Role::UpdateMdsRegistration(PCS_RDA_ROLE new_role,
void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id,
uint64_t peer_promote_pending) {
+ TRACE_ENTER();
if (role() == PCS_RDA_UNDEFINED) {
bool give_up = false;
RDE_CONTROL_BLOCK *cb = rde_get_control_block();
@@ -372,6 +423,14 @@ void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID
node_id,
}
if (node_role == PCS_RDA_ACTIVE || node_role == PCS_RDA_STANDBY ||
give_up) {
+ // broadcast QUIESCED role to all peers to stop their waiting peer
+ // info timer
+ rde_msg peer_info_req;
+ peer_info_req.type = RDE_MSG_PEER_INFO_RESP;
+ peer_info_req.info.peer_info.ha_role = PCS_RDA_QUIESCED;
+ peer_info_req.info.peer_info.promote_pending = 0;
+ rde_mds_broadcast(&peer_info_req);
+
SetRole(PCS_RDA_QUIESCED);
LOG_NO("Giving up election against 0x%" PRIx32
" with role %s. "
@@ -379,6 +438,8 @@ void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID
node_id,
node_id, to_string(node_role), to_string(role()));
}
}
+ known_nodes_.insert(node_id);
+ StopPeerInfoWaitTimer();
}
void Role::PromoteNodeLate() {
diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
index 2d24361c5..218897892 100644
--- a/src/rde/rded/role.h
+++ b/src/rde/rded/role.h
@@ -50,12 +50,16 @@ class Role {
void NodePromoted();
void PromoteNodeLate();
void RefreshConsensusState(RDE_CONTROL_BLOCK* cb);
+ void RemovePeer(NODE_ID node_id);
private:
static const uint64_t kDefaultDiscoverPeerTimeout = 2000;
+ static const uint64_t kDefaultWaitPeerInfoTimeout = 2000;
static const uint64_t kDefaultPreActiveScriptTimeout = 5000;
void ExecutePreActiveScript();
void ResetElectionTimer();
+ void ResetPeerInfoWaitTimer();
+ void StopPeerInfoWaitTimer();
uint32_t UpdateMdsRegistration(PCS_RDA_ROLE new_role, PCS_RDA_ROLE old_role);
void PromoteNode(const uint64_t cluster_size, const bool relaxed_mode);
@@ -68,6 +72,9 @@ class Role {
uint64_t pre_active_script_timeout_;
static const char* const role_names_[];
static const char* const pre_active_script_;
+ bool received_peer_info_;
+ timespec peer_info_wait_time_;
+ uint64_t peer_info_wait_timeout_;
DELETE_COPY_AND_MOVE_OPERATORS(Role);
};
--
2.20.1
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel