Replace the request-reply message exchange between RDE peers with a simpler protocol that sends the reply message immediately after receiving a peer up notification. This will minimize the probability of trying to send to a peer that is down. Also remove the retransmission logic, which was mostly needed because messages could be received before the node up event; this is no longer an issue. --- src/rde/rded/rde_main.cc | 26 +++++--------------------- src/rde/rded/rde_mds.cc | 44 +++++++++++++++++++++++--------------------- src/rde/rded/role.cc | 9 ++++++++- src/rde/rded/role.h | 3 +++ 4 files changed, 39 insertions(+), 43 deletions(-)
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc index a4ff05806..0298bf3ff 100644 --- a/src/rde/rded/rde_main.cc +++ b/src/rde/rded/rde_main.cc @@ -42,7 +42,6 @@ enum { FD_TERM = 0, FD_AMF = 1, FD_MBX, FD_RDA_SERVER, FD_CLIENT_START }; -static void SendPeerInfoReq(MDS_DEST mds_dest); static void SendPeerInfoResp(MDS_DEST mds_dest); static void CheckForSplitBrain(const rde_msg *msg); @@ -99,19 +98,10 @@ static void handle_mbx_event() { Role::to_string(role->role())); switch (msg->type) { - case RDE_MSG_PEER_INFO_REQ: { - LOG_NO("Got peer info request from node 0x%x with role %s", - msg->fr_node_id, Role::to_string(msg->info.peer_info.ha_role)); - CheckForSplitBrain(msg); - if(msg->info.peer_info.ha_role == PCS_RDA_ACTIVE - || msg->info.peer_info.ha_role == PCS_RDA_STANDBY) { - role->SetPeerState(msg->info.peer_info.ha_role, msg->fr_node_id); - } - SendPeerInfoResp(msg->fr_dest); - break; - } + case RDE_MSG_PEER_INFO_REQ: case RDE_MSG_PEER_INFO_RESP: { - LOG_NO("Got peer info response from node 0x%x with role %s", + LOG_NO("Got peer info %s from node 0x%x with role %s", + msg->type == RDE_MSG_PEER_INFO_RESP ? 
"response" : "request", msg->fr_node_id, Role::to_string(msg->info.peer_info.ha_role)); CheckForSplitBrain(msg); role->SetPeerState(msg->info.peer_info.ha_role, msg->fr_node_id); @@ -120,7 +110,8 @@ static void handle_mbx_event() { case RDE_MSG_PEER_UP: { if (msg->fr_node_id != own_node_id) { LOG_NO("Peer up on node 0x%x", msg->fr_node_id); - SendPeerInfoReq(msg->fr_dest); + SendPeerInfoResp(msg->fr_dest); + role->AddPeer(msg->fr_node_id); } break; } @@ -145,13 +136,6 @@ static void CheckForSplitBrain(const rde_msg *msg) { } } -static void SendPeerInfoReq(MDS_DEST mds_dest) { - rde_msg peer_info_req; - peer_info_req.type = RDE_MSG_PEER_INFO_REQ; - peer_info_req.info.peer_info.ha_role = role->role(); - rde_mds_send(&peer_info_req, mds_dest); -} - static void SendPeerInfoResp(MDS_DEST mds_dest) { rde_msg peer_info_req; peer_info_req.type = RDE_MSG_PEER_INFO_RESP; diff --git a/src/rde/rded/rde_mds.cc b/src/rde/rded/rde_mds.cc index 998165c2b..5c465dc8e 100644 --- a/src/rde/rded/rde_mds.cc +++ b/src/rde/rded/rde_mds.cc @@ -264,27 +264,29 @@ uint32_t rde_mds_send(struct rde_msg *msg, MDS_DEST to_dest) { NCSMDS_INFO info; uint32_t rc; - for (int i = 0; i != 3; ++i) { - TRACE("Sending %s to %" PRIx64, rde_msg_name[msg->type], to_dest); - memset(&info, 0, sizeof(info)); - - info.i_mds_hdl = mds_hdl; - info.i_op = MDS_SEND; - info.i_svc_id = NCSMDS_SVC_ID_RDE; - - info.info.svc_send.i_msg = msg; - info.info.svc_send.i_priority = MDS_SEND_PRIORITY_MEDIUM; - info.info.svc_send.i_sendtype = MDS_SENDTYPE_SND; - info.info.svc_send.i_to_svc = NCSMDS_SVC_ID_RDE; - info.info.svc_send.info.snd.i_to_dest = to_dest; - - rc = ncsmds_api(&info); - if (NCSCC_RC_FAILURE == rc) { - LOG_WA("Failed to send %s to %" PRIx64, rde_msg_name[msg->type], to_dest); - base::Sleep(base::kOneHundredMilliseconds); - } else { - break; - } + TRACE("Sending %s to %" PRIx64, rde_msg_name[msg->type], to_dest); + memset(&info, 0, sizeof(info)); + + info.i_mds_hdl = mds_hdl; + info.i_op = MDS_SEND; + 
info.i_svc_id = NCSMDS_SVC_ID_RDE; + + info.info.svc_send.i_msg = msg; + info.info.svc_send.i_priority = MDS_SEND_PRIORITY_MEDIUM; + info.info.svc_send.i_sendtype = MDS_SENDTYPE_SND; + info.info.svc_send.i_to_svc = NCSMDS_SVC_ID_RDE; + info.info.svc_send.info.snd.i_to_dest = to_dest; + + struct timespec start_time = base::ReadMonotonicClock(); + rc = ncsmds_api(&info); + struct timespec end_time = base::ReadMonotonicClock(); + uint64_t duration = base::TimespecToMicros(end_time - start_time); + if (NCSCC_RC_FAILURE == rc) { + LOG_WA("Failed to send %s to %" PRIx64 ", and blocked for %" PRIu64 " us", + rde_msg_name[msg->type], to_dest, duration); + } else if (duration > 5000) { + LOG_WA("Sending %s to %" PRIx64 " blocked for %" PRIu64 " us", + rde_msg_name[msg->type], to_dest, duration); } return rc; diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc index efab6723b..f7511f0d8 100644 --- a/src/rde/rded/role.cc +++ b/src/rde/rded/role.cc @@ -43,7 +43,8 @@ const char* Role::to_string(PCS_RDA_ROLE role) { } Role::Role(NODE_ID own_node_id) - : role_{PCS_RDA_QUIESCED}, + : known_nodes_{}, + role_{PCS_RDA_QUIESCED}, own_node_id_{own_node_id}, proc_{new base::Process()}, election_end_time_{}, @@ -76,6 +77,11 @@ void Role::ExecutePreActiveScript() { std::chrono::milliseconds(pre_active_script_timeout_)); } +void Role::AddPeer(NODE_ID node_id) { + auto result = known_nodes_.insert(node_id); + if (result.second) ResetElectionTimer(); +} + uint32_t Role::SetRole(PCS_RDA_ROLE new_role) { PCS_RDA_ROLE old_role = role_; if (new_role == PCS_RDA_ACTIVE && @@ -88,6 +94,7 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) { if (new_role == PCS_RDA_ACTIVE) ExecutePreActiveScript(); role_ = new_role; if (new_role == PCS_RDA_UNDEFINED) { + known_nodes_.clear(); ResetElectionTimer(); } else { rde_rda_send_role(new_role); diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h index 255610297..20219b535 100644 --- a/src/rde/rded/role.h +++ b/src/rde/rded/role.h @@ -20,6 +20,7 @@ 
#include <time.h> #include <cstdint> +#include <set> #include "base/macros.h" #include "mds/mds_papi.h" #include "rde/agent/rda_papi.h" @@ -31,6 +32,7 @@ class Process; class Role { public: explicit Role(NODE_ID own_node_id); + void AddPeer(NODE_ID node_id); void SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id); timespec* Poll(timespec* ts); uint32_t SetRole(PCS_RDA_ROLE new_role); @@ -44,6 +46,7 @@ class Role { void ResetElectionTimer(); uint32_t UpdateMdsRegistration(PCS_RDA_ROLE new_role, PCS_RDA_ROLE old_role); + std::set<NODE_ID> known_nodes_; PCS_RDA_ROLE role_; NODE_ID own_node_id_; base::Process* proc_; -- 2.13.3 ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel