RDE detects the peer_up message and assumes that the peer_info message will arrive afterwards. However, with roaming SCs, when all SCs rejoin after a network split, the last active SC may miss the peer_info message because the other SCs have already rebooted.
Patch adds timeout to wait for peer info message to avoid a risk of missing peer info message to detect duplicated active SC. The new timeout is used for all peers, meaning that the timeout reset for each peer up message and wait for the last peer info message. --- src/rde/rded/role.cc | 46 +++++++++++++++++++++++++++++++++++++++++++- src/rde/rded/role.h | 6 ++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc index 3732be449..464813482 100644 --- a/src/rde/rded/role.cc +++ b/src/rde/rded/role.cc @@ -196,9 +196,13 @@ Role::Role(NODE_ID own_node_id) discover_peer_timeout_{base::GetEnv("RDE_DISCOVER_PEER_TIMEOUT", kDefaultDiscoverPeerTimeout)}, pre_active_script_timeout_{base::GetEnv( - "RDE_PRE_ACTIVE_SCRIPT_TIMEOUT", kDefaultPreActiveScriptTimeout)} {} + "RDE_PRE_ACTIVE_SCRIPT_TIMEOUT", kDefaultPreActiveScriptTimeout)}, + received_peer_info_{true}, + peer_info_wait_time_{}, + peer_info_wait_timeout_ {kDefaultWaitPeerInfoTimeout} {} timespec* Role::Poll(timespec* ts) { + TRACE_ENTER(); timespec* timeout = nullptr; if (role_ == PCS_RDA_UNDEFINED) { timespec now = base::ReadMonotonicClock(); @@ -238,6 +242,25 @@ timespec* Role::Poll(timespec* ts) { cb->state_refresh_thread_started = true; std::thread(&Role::RefreshConsensusState, this, cb).detach(); } + if (consensus_service.IsEnabled() == false) { + // We are already ACTIVE, and has just discovered a new node + // which makes the election_end_time_ reset + if (received_peer_info_ == false) { + timespec now = base::ReadMonotonicClock(); + if (peer_info_wait_time_ >= now) { + *ts = peer_info_wait_time_ - now; + timeout = ts; + } else { + // Timeout but haven't received peer info + // The peer RDE could be in ACTIVE + // thus self-fence to avoid split-brain risk + LOG_ER("Discovery peer up without peer info. 
Risk in split-brain," + "rebooting this node"); + opensaf_quick_reboot("Probable split-brain due to " + "unknown RDE peer info"); + } + } + } } } return timeout; @@ -251,9 +274,14 @@ void Role::ExecutePreActiveScript() { } void Role::AddPeer(NODE_ID node_id) { + TRACE_ENTER(); auto result = known_nodes_.insert(node_id); if (result.second) { ResetElectionTimer(); + if (role_ == PCS_RDA_ACTIVE) { + ResetPeerInfoWaitTimer(); + received_peer_info_ = false; + } } } @@ -330,10 +358,24 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) { } void Role::ResetElectionTimer() { + TRACE_ENTER(); election_end_time_ = base::ReadMonotonicClock() + base::MillisToTimespec(discover_peer_timeout_); } +void Role::ResetPeerInfoWaitTimer() { + TRACE_ENTER(); + // Reuse peer discovery timeout + peer_info_wait_time_ = base::ReadMonotonicClock() + + base::MillisToTimespec(peer_info_wait_timeout_); +} + +void Role::StopPeerInfoWaitTimer() { + TRACE_ENTER(); + // Turn off peer_info_timer + received_peer_info_ = true; +} + uint32_t Role::UpdateMdsRegistration(PCS_RDA_ROLE new_role, PCS_RDA_ROLE old_role) { uint32_t rc = NCSCC_RC_SUCCESS; @@ -357,6 +399,7 @@ uint32_t Role::UpdateMdsRegistration(PCS_RDA_ROLE new_role, void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id, uint64_t peer_promote_pending) { + TRACE_ENTER(); if (role() == PCS_RDA_UNDEFINED) { bool give_up = false; RDE_CONTROL_BLOCK *cb = rde_get_control_block(); @@ -379,6 +422,7 @@ void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id, node_id, to_string(node_role), to_string(role())); } } + StopPeerInfoWaitTimer(); } void Role::PromoteNodeLate() { diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h index 2d24361c5..1fb84d1a8 100644 --- a/src/rde/rded/role.h +++ b/src/rde/rded/role.h @@ -53,9 +53,12 @@ class Role { private: static const uint64_t kDefaultDiscoverPeerTimeout = 2000; + static const uint64_t kDefaultWaitPeerInfoTimeout = 5000; static const uint64_t kDefaultPreActiveScriptTimeout = 5000; void 
ExecutePreActiveScript(); void ResetElectionTimer(); + void ResetPeerInfoWaitTimer(); + void StopPeerInfoWaitTimer(); uint32_t UpdateMdsRegistration(PCS_RDA_ROLE new_role, PCS_RDA_ROLE old_role); void PromoteNode(const uint64_t cluster_size, const bool relaxed_mode); @@ -68,6 +71,9 @@ class Role { uint64_t pre_active_script_timeout_; static const char* const role_names_[]; static const char* const pre_active_script_; + bool received_peer_info_; + timespec peer_info_wait_time_; + uint64_t peer_info_wait_timeout_; DELETE_COPY_AND_MOVE_OPERATORS(Role); }; -- 2.20.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel