Replace the request-reply message exchange between RDE peers with a simpler
protocol that sends the reply message immediately after receiving a peer up
notification. This minimizes the probability of trying to send to a peer
that is down. Also remove the retransmission logic, which was mostly needed
because messages could be received before the node up event; that is no
longer an issue.
---
 src/rde/rded/rde_main.cc | 26 +++++---------------------
 src/rde/rded/rde_mds.cc  | 44 +++++++++++++++++++++++---------------------
 src/rde/rded/role.cc     |  9 ++++++++-
 src/rde/rded/role.h      |  3 +++
 4 files changed, 39 insertions(+), 43 deletions(-)

diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index a4ff05806..0298bf3ff 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -42,7 +42,6 @@
 
 enum { FD_TERM = 0, FD_AMF = 1, FD_MBX, FD_RDA_SERVER, FD_CLIENT_START };
 
-static void SendPeerInfoReq(MDS_DEST mds_dest);
 static void SendPeerInfoResp(MDS_DEST mds_dest);
 static void CheckForSplitBrain(const rde_msg *msg);
 
@@ -99,19 +98,10 @@ static void handle_mbx_event() {
         Role::to_string(role->role()));
 
   switch (msg->type) {
-    case RDE_MSG_PEER_INFO_REQ: {
-      LOG_NO("Got peer info request from node 0x%x with role %s",
-             msg->fr_node_id, Role::to_string(msg->info.peer_info.ha_role));
-      CheckForSplitBrain(msg);
-      if(msg->info.peer_info.ha_role == PCS_RDA_ACTIVE
-          || msg->info.peer_info.ha_role == PCS_RDA_STANDBY) {
-        role->SetPeerState(msg->info.peer_info.ha_role, msg->fr_node_id);
-      }
-      SendPeerInfoResp(msg->fr_dest);
-      break;
-    }
+    case RDE_MSG_PEER_INFO_REQ:
     case RDE_MSG_PEER_INFO_RESP: {
-      LOG_NO("Got peer info response from node 0x%x with role %s",
+      LOG_NO("Got peer info %s from node 0x%x with role %s",
+             msg->type == RDE_MSG_PEER_INFO_RESP ? "response" : "request",
              msg->fr_node_id, Role::to_string(msg->info.peer_info.ha_role));
       CheckForSplitBrain(msg);
       role->SetPeerState(msg->info.peer_info.ha_role, msg->fr_node_id);
@@ -120,7 +110,8 @@ static void handle_mbx_event() {
     case RDE_MSG_PEER_UP: {
       if (msg->fr_node_id != own_node_id) {
         LOG_NO("Peer up on node 0x%x", msg->fr_node_id);
-        SendPeerInfoReq(msg->fr_dest);
+        SendPeerInfoResp(msg->fr_dest);
+        role->AddPeer(msg->fr_node_id);
       }
       break;
     }
@@ -145,13 +136,6 @@ static void CheckForSplitBrain(const rde_msg *msg) {
   }
 }
 
-static void SendPeerInfoReq(MDS_DEST mds_dest) {
-  rde_msg peer_info_req;
-  peer_info_req.type = RDE_MSG_PEER_INFO_REQ;
-  peer_info_req.info.peer_info.ha_role = role->role();
-  rde_mds_send(&peer_info_req, mds_dest);
-}
-
 static void SendPeerInfoResp(MDS_DEST mds_dest) {
   rde_msg peer_info_req;
   peer_info_req.type = RDE_MSG_PEER_INFO_RESP;
diff --git a/src/rde/rded/rde_mds.cc b/src/rde/rded/rde_mds.cc
index 998165c2b..5c465dc8e 100644
--- a/src/rde/rded/rde_mds.cc
+++ b/src/rde/rded/rde_mds.cc
@@ -264,27 +264,29 @@ uint32_t rde_mds_send(struct rde_msg *msg, MDS_DEST to_dest) {
   NCSMDS_INFO info;
   uint32_t rc;
 
-  for (int i = 0; i != 3; ++i) {
-    TRACE("Sending %s to %" PRIx64, rde_msg_name[msg->type], to_dest);
-    memset(&info, 0, sizeof(info));
-
-    info.i_mds_hdl = mds_hdl;
-    info.i_op = MDS_SEND;
-    info.i_svc_id = NCSMDS_SVC_ID_RDE;
-
-    info.info.svc_send.i_msg = msg;
-    info.info.svc_send.i_priority = MDS_SEND_PRIORITY_MEDIUM;
-    info.info.svc_send.i_sendtype = MDS_SENDTYPE_SND;
-    info.info.svc_send.i_to_svc = NCSMDS_SVC_ID_RDE;
-    info.info.svc_send.info.snd.i_to_dest = to_dest;
-
-    rc = ncsmds_api(&info);
-    if (NCSCC_RC_FAILURE == rc) {
-      LOG_WA("Failed to send %s to %" PRIx64, rde_msg_name[msg->type], to_dest);
-      base::Sleep(base::kOneHundredMilliseconds);
-    } else {
-      break;
-    }
+  TRACE("Sending %s to %" PRIx64, rde_msg_name[msg->type], to_dest);
+  memset(&info, 0, sizeof(info));
+
+  info.i_mds_hdl = mds_hdl;
+  info.i_op = MDS_SEND;
+  info.i_svc_id = NCSMDS_SVC_ID_RDE;
+
+  info.info.svc_send.i_msg = msg;
+  info.info.svc_send.i_priority = MDS_SEND_PRIORITY_MEDIUM;
+  info.info.svc_send.i_sendtype = MDS_SENDTYPE_SND;
+  info.info.svc_send.i_to_svc = NCSMDS_SVC_ID_RDE;
+  info.info.svc_send.info.snd.i_to_dest = to_dest;
+
+  struct timespec start_time = base::ReadMonotonicClock();
+  rc = ncsmds_api(&info);
+  struct timespec end_time = base::ReadMonotonicClock();
+  uint64_t duration = base::TimespecToMicros(end_time - start_time);
+  if (NCSCC_RC_FAILURE == rc) {
+    LOG_WA("Failed to send %s to %" PRIx64 ", and blocked for %" PRIu64 " us",
+           rde_msg_name[msg->type], to_dest, duration);
+  } else if (duration > 5000) {
+    LOG_WA("Sending %s to %" PRIx64 " blocked for %" PRIu64 " us",
+           rde_msg_name[msg->type], to_dest, duration);
   }
 
   return rc;
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index efab6723b..f7511f0d8 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -43,7 +43,8 @@ const char* Role::to_string(PCS_RDA_ROLE role) {
 }
 
 Role::Role(NODE_ID own_node_id)
-    : role_{PCS_RDA_QUIESCED},
+    : known_nodes_{},
+      role_{PCS_RDA_QUIESCED},
       own_node_id_{own_node_id},
       proc_{new base::Process()},
       election_end_time_{},
@@ -76,6 +77,11 @@ void Role::ExecutePreActiveScript() {
                  std::chrono::milliseconds(pre_active_script_timeout_));
 }
 
+void Role::AddPeer(NODE_ID node_id) {
+  auto result = known_nodes_.insert(node_id);
+  if (result.second) ResetElectionTimer();
+}
+
 uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
   PCS_RDA_ROLE old_role = role_;
   if (new_role == PCS_RDA_ACTIVE &&
@@ -88,6 +94,7 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
     if (new_role == PCS_RDA_ACTIVE) ExecutePreActiveScript();
     role_ = new_role;
     if (new_role == PCS_RDA_UNDEFINED) {
+      known_nodes_.clear();
       ResetElectionTimer();
     } else {
       rde_rda_send_role(new_role);
diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
index 255610297..20219b535 100644
--- a/src/rde/rded/role.h
+++ b/src/rde/rded/role.h
@@ -20,6 +20,7 @@
 
 #include <time.h>
 #include <cstdint>
+#include <set>
 #include "base/macros.h"
 #include "mds/mds_papi.h"
 #include "rde/agent/rda_papi.h"
@@ -31,6 +32,7 @@ class Process;
 class Role {
  public:
   explicit Role(NODE_ID own_node_id);
+  void AddPeer(NODE_ID node_id);
   void SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id);
   timespec* Poll(timespec* ts);
   uint32_t SetRole(PCS_RDA_ROLE new_role);
@@ -44,6 +46,7 @@ class Role {
   void ResetElectionTimer();
   uint32_t UpdateMdsRegistration(PCS_RDA_ROLE new_role, PCS_RDA_ROLE old_role);
 
+  std::set<NODE_ID> known_nodes_;
   PCS_RDA_ROLE role_;
   NODE_ID own_node_id_;
   base::Process* proc_;
-- 
2.13.3


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to