This ticket revisits the wait for peer info and
fixes the problem of out-of-order peer_up and peer info
messages in commit d1593b03b3c9bec292b14dde65264c261760bf46
---
src/rde/rded/rde_main.cc | 1 +
src/rde/rded/role.cc | 63 +++-
src/rde/rded/role.h | 7 +
3 files changed, 70 insertions(+), 1 deletion(-)
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index 8ed6b046e..33dd645e2 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -125,6 +125,7 @@ static void handle_mbx_event() {
}
case RDE_MSG_PEER_DOWN:
LOG_NO("Peer down on node 0x%x", msg->fr_node_id);
+ role->RemovePeer(msg->fr_node_id);
break;
case RDE_MSG_NEW_ACTIVE_CALLBACK: {
const std::string my_node = base::Conf::NodeName();
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index 3732be449..344702e63 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -196,9 +196,13 @@ Role::Role(NODE_ID own_node_id)
discover_peer_timeout_{base::GetEnv("RDE_DISCOVER_PEER_TIMEOUT",
kDefaultDiscoverPeerTimeout)},
pre_active_script_timeout_{base::GetEnv(
- "RDE_PRE_ACTIVE_SCRIPT_TIMEOUT", kDefaultPreActiveScriptTimeout)} {}
+ "RDE_PRE_ACTIVE_SCRIPT_TIMEOUT", kDefaultPreActiveScriptTimeout)},
+ received_peer_info_{true},
+ peer_info_wait_time_{},
+ peer_info_wait_timeout_ {kDefaultWaitPeerInfoTimeout} {}
timespec* Role::Poll(timespec* ts) {
+ TRACE_ENTER();
timespec* timeout = nullptr;
if (role_ == PCS_RDA_UNDEFINED) {
timespec now = base::ReadMonotonicClock();
@@ -238,6 +242,25 @@ timespec* Role::Poll(timespec* ts) {
cb->state_refresh_thread_started = true;
std::thread(::RefreshConsensusState, this, cb).detach();
}
+ if (consensus_service.IsEnabled() == false) {
+// We are already ACTIVE and have just discovered a new node,
+// which resets election_end_time_
+if (received_peer_info_ == false) {
+ timespec now = base::ReadMonotonicClock();
+ if (peer_info_wait_time_ >= now) {
+*ts = peer_info_wait_time_ - now;
+timeout = ts;
+ } else {
+// Timeout but haven't received peer info
+// The peer RDE could be in ACTIVE
+// thus self-fence to avoid split-brain risk
+LOG_ER("Discovery peer up without peer info. Risk in split-brain,"
+"rebooting this node");
+opensaf_quick_reboot("Probable split-brain due to "
+"unknown RDE peer info");
+ }
+}
+ }
}
}
return timeout;
@@ -251,12 +274,25 @@ void Role::ExecutePreActiveScript() {
}
void Role::AddPeer(NODE_ID node_id) {
+ TRACE_ENTER();
auto result = known_nodes_.insert(node_id);
if (result.second) {
ResetElectionTimer();
+if (role_ == PCS_RDA_ACTIVE) {
+ ResetPeerInfoWaitTimer();
+ received_peer_info_ = false;
+}
}
}
+void Role::RemovePeer(NODE_ID node_id) {
+ TRACE_ENTER();
+ if (received_peer_info_ == false && role_ != PCS_RDA_ACTIVE) {
+StopPeerInfoWaitTimer();
+ }
+ known_nodes_.erase(node_id);
+}
+
// call from main thread only
bool Role::IsCandidate() {
TRACE_ENTER();
@@ -330,10 +366,24 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
}
void Role::ResetElectionTimer() {
+ TRACE_ENTER();
election_end_time_ = base::ReadMonotonicClock() +
base::MillisToTimespec(discover_peer_timeout_);
}
+void Role::ResetPeerInfoWaitTimer() {
+ TRACE_ENTER();
+ LOG_NO("Start/restart waiting peer info timer");
+ peer_info_wait_time_ = base::ReadMonotonicClock() +
+ base::MillisToTimespec(peer_info_wait_timeout_);
+}
+
+void Role::StopPeerInfoWaitTimer() {
+ TRACE_ENTER();
+ // Turn off peer_info_timer
+ received_peer_info_ = true;
+}
+
uint32_t Role::UpdateMdsRegistration(PCS_RDA_ROLE new_role,
PCS_RDA_ROLE old_role) {
uint32_t rc = NCSCC_RC_SUCCESS;
@@ -357,6 +407,7 @@ uint32_t Role::UpdateMdsRegistration(PCS_RDA_ROLE new_role,
void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id,
uint64_t peer_promote_pending) {
+ TRACE_ENTER();
if (role() == PCS_RDA_UNDEFINED) {
bool give_up = false;
RDE_CONTROL_BLOCK *cb = rde_get_control_block();
@@ -372,6 +423,14 @@ void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id,
}
if (node_role == PCS_RDA_ACTIVE || node_role == PCS_RDA_STANDBY ||
give_up) {
+ // broadcast QUIESCED role to all peers to stop their waiting peer
+ // info timer
+ rde_msg peer_info_req;
+ peer_info_req.type = RDE_MSG_PEER_INFO_RESP;
+ peer_info_req.info.peer_info.ha_role = PCS_RDA_QUIESCED;
+ peer_info_req.info.peer_info.promote_pending = 0;
+ rde_mds_broadcast(&peer_info_req);
+