Replace the request-reply message exchange between RDE peers with a simpler
protocol that sends the reply message immediately after receiving a peer up
notification. This minimizes the probability of trying to send to a peer that
is down. Also remove the retransmission logic, which was mostly needed because
messages could be received before the node up event; that is no longer an
issue.
---
src/rde/rded/rde_main.cc | 26 +++++---------------------
src/rde/rded/rde_mds.cc | 44 +++++++++++++++++++++++---------------------
src/rde/rded/role.cc | 9 ++++++++-
src/rde/rded/role.h | 3 +++
4 files changed, 39 insertions(+), 43 deletions(-)
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index a4ff05806..0298bf3ff 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -42,7 +42,6 @@
enum { FD_TERM = 0, FD_AMF = 1, FD_MBX, FD_RDA_SERVER, FD_CLIENT_START };
-static void SendPeerInfoReq(MDS_DEST mds_dest);
static void SendPeerInfoResp(MDS_DEST mds_dest);
static void CheckForSplitBrain(const rde_msg *msg);
@@ -99,19 +98,10 @@ static void handle_mbx_event() {
Role::to_string(role->role()));
switch (msg->type) {
- case RDE_MSG_PEER_INFO_REQ: {
- LOG_NO("Got peer info request from node 0x%x with role %s",
- msg->fr_node_id, Role::to_string(msg->info.peer_info.ha_role));
- CheckForSplitBrain(msg);
- if(msg->info.peer_info.ha_role == PCS_RDA_ACTIVE
- || msg->info.peer_info.ha_role == PCS_RDA_STANDBY) {
- role->SetPeerState(msg->info.peer_info.ha_role, msg->fr_node_id);
- }
- SendPeerInfoResp(msg->fr_dest);
- break;
- }
+ case RDE_MSG_PEER_INFO_REQ:
case RDE_MSG_PEER_INFO_RESP: {
- LOG_NO("Got peer info response from node 0x%x with role %s",
+ LOG_NO("Got peer info %s from node 0x%x with role %s",
+ msg->type == RDE_MSG_PEER_INFO_RESP ? "response" : "request",
msg->fr_node_id, Role::to_string(msg->info.peer_info.ha_role));
CheckForSplitBrain(msg);
role->SetPeerState(msg->info.peer_info.ha_role, msg->fr_node_id);
@@ -120,7 +110,8 @@ static void handle_mbx_event() {
case RDE_MSG_PEER_UP: {
if (msg->fr_node_id != own_node_id) {
LOG_NO("Peer up on node 0x%x", msg->fr_node_id);
- SendPeerInfoReq(msg->fr_dest);
+ SendPeerInfoResp(msg->fr_dest);
+ role->AddPeer(msg->fr_node_id);
}
break;
}
@@ -145,13 +136,6 @@ static void CheckForSplitBrain(const rde_msg *msg) {
}
}
-static void SendPeerInfoReq(MDS_DEST mds_dest) {
- rde_msg peer_info_req;
- peer_info_req.type = RDE_MSG_PEER_INFO_REQ;
- peer_info_req.info.peer_info.ha_role = role->role();
- rde_mds_send(&peer_info_req, mds_dest);
-}
-
static void SendPeerInfoResp(MDS_DEST mds_dest) {
rde_msg peer_info_req;
peer_info_req.type = RDE_MSG_PEER_INFO_RESP;
diff --git a/src/rde/rded/rde_mds.cc b/src/rde/rded/rde_mds.cc
index 998165c2b..5c465dc8e 100644
--- a/src/rde/rded/rde_mds.cc
+++ b/src/rde/rded/rde_mds.cc
@@ -264,27 +264,29 @@ uint32_t rde_mds_send(struct rde_msg *msg, MDS_DEST to_dest) {
NCSMDS_INFO info;
uint32_t rc;
- for (int i = 0; i != 3; ++i) {
- TRACE("Sending %s to %" PRIx64, rde_msg_name[msg->type], to_dest);
- memset(&info, 0, sizeof(info));
-
- info.i_mds_hdl = mds_hdl;
- info.i_op = MDS_SEND;
- info.i_svc_id = NCSMDS_SVC_ID_RDE;
-
- info.info.svc_send.i_msg = msg;
- info.info.svc_send.i_priority = MDS_SEND_PRIORITY_MEDIUM;
- info.info.svc_send.i_sendtype = MDS_SENDTYPE_SND;
- info.info.svc_send.i_to_svc = NCSMDS_SVC_ID_RDE;
- info.info.svc_send.info.snd.i_to_dest = to_dest;
-
- rc = ncsmds_api(&info);
- if (NCSCC_RC_FAILURE == rc) {
- LOG_WA("Failed to send %s to %" PRIx64, rde_msg_name[msg->type], to_dest);
- base::Sleep(base::kOneHundredMilliseconds);
- } else {
- break;
- }
+ TRACE("Sending %s to %" PRIx64, rde_msg_name[msg->type], to_dest);
+ memset(&info, 0, sizeof(info));
+
+ info.i_mds_hdl = mds_hdl;
+ info.i_op = MDS_SEND;
+ info.i_svc_id = NCSMDS_SVC_ID_RDE;
+
+ info.info.svc_send.i_msg = msg;
+ info.info.svc_send.i_priority = MDS_SEND_PRIORITY_MEDIUM;
+ info.info.svc_send.i_sendtype = MDS_SENDTYPE_SND;
+ info.info.svc_send.i_to_svc = NCSMDS_SVC_ID_RDE;
+ info.info.svc_send.info.snd.i_to_dest = to_dest;
+
+ struct timespec start_time = base::ReadMonotonicClock();
+ rc = ncsmds_api(&info);
+ struct timespec end_time = base::ReadMonotonicClock();
+ uint64_t duration = base::TimespecToMicros(end_time - start_time);
+ if (NCSCC_RC_FAILURE == rc) {
+ LOG_WA("Failed to send %s to %" PRIx64 ", and blocked for %" PRIu64 " us",
+ rde_msg_name[msg->type], to_dest, duration);
+ } else if (duration > 5000) {
+ LOG_WA("Sending %s to %" PRIx64 " blocked for %" PRIu64 " us",
+ rde_msg_name[msg->type], to_dest, duration);
}
return rc;
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index efab6723b..f7511f0d8 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -43,7 +43,8 @@ const char* Role::to_string(PCS_RDA_ROLE role) {
}
Role::Role(NODE_ID own_node_id)
- : role_{PCS_RDA_QUIESCED},
+ : known_nodes_{},
+ role_{PCS_RDA_QUIESCED},
own_node_id_{own_node_id},
proc_{new base::Process()},
election_end_time_{},
@@ -76,6 +77,11 @@ void Role::ExecutePreActiveScript() {
std::chrono::milliseconds(pre_active_script_timeout_));
}
+void Role::AddPeer(NODE_ID node_id) {
+ auto result = known_nodes_.insert(node_id);
+ if (result.second) ResetElectionTimer();
+}
+
uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
PCS_RDA_ROLE old_role = role_;
if (new_role == PCS_RDA_ACTIVE &&
@@ -88,6 +94,7 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
if (new_role == PCS_RDA_ACTIVE) ExecutePreActiveScript();
role_ = new_role;
if (new_role == PCS_RDA_UNDEFINED) {
+ known_nodes_.clear();
ResetElectionTimer();
} else {
rde_rda_send_role(new_role);
diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
index 255610297..20219b535 100644
--- a/src/rde/rded/role.h
+++ b/src/rde/rded/role.h
@@ -20,6 +20,7 @@
#include <time.h>
#include <cstdint>
+#include <set>
#include "base/macros.h"
#include "mds/mds_papi.h"
#include "rde/agent/rda_papi.h"
@@ -31,6 +32,7 @@ class Process;
class Role {
public:
explicit Role(NODE_ID own_node_id);
+ void AddPeer(NODE_ID node_id);
void SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id);
timespec* Poll(timespec* ts);
uint32_t SetRole(PCS_RDA_ROLE new_role);
@@ -44,6 +46,7 @@ class Role {
void ResetElectionTimer();
uint32_t UpdateMdsRegistration(PCS_RDA_ROLE new_role, PCS_RDA_ROLE old_role);
+ std::set<NODE_ID> known_nodes_;
PCS_RDA_ROLE role_;
NODE_ID own_node_id_;
base::Process* proc_;
--
2.13.3
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel