Allow promotion of node to active at cluster startup, even if the
consensus service is unavailable, if the peer SC can be seen.

During normal cluster operation, if the consensus service becomes
unavailable but the peer SC can still be seen, allow the existing
active SC to remain active.

A new NCSMDS_SVC_ID_RDE_DISCOVERY service ID is exported by rded.
This is installed as soon as rded is started, unlike
NCSMDS_SVC_ID_RDE which is only installed when it becomes
a candidate for election.
---
 src/mds/mds_papi.h       |  1 +
 src/rde/rded/rde_cb.h    | 12 +++++-
 src/rde/rded/rde_main.cc | 71 +++++++++++++++++++++++++++++++----
 src/rde/rded/rde_mds.cc  | 94 ++++++++++++++++++++++++++++++++++++++++++++--
 src/rde/rded/role.cc     | 97 +++++++++++++++++++++++++++++++++++++++++++-----
 src/rde/rded/role.h      |  4 +-
 6 files changed, 256 insertions(+), 23 deletions(-)

diff --git a/src/mds/mds_papi.h b/src/mds/mds_papi.h
index 03d755d..7cd543c 100644
--- a/src/mds/mds_papi.h
+++ b/src/mds/mds_papi.h
@@ -191,6 +191,7 @@ typedef enum ncsmds_svc_id {
   NCSMDS_SVC_ID_PLMS = 37,
   NCSMDS_SVC_ID_PLMS_HRB = 38,
   NCSMDS_SVC_ID_PLMA = 39,
+  NCSMDS_SVC_ID_RDE_DISCOVERY = 40,
   NCSMDS_SVC_ID_NCSMAX, /* This mnemonic always last */
 
   /* The range below is for OpenSAF internal use */
diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
index d3f5a24..9a0919c 100644
--- a/src/rde/rded/rde_cb.h
+++ b/src/rde/rded/rde_cb.h
@@ -34,6 +34,9 @@
  **
  */
 
+enum class State {kNotActive = 0, kNotActiveSeenPeer, kActiveElected,
+                  kActiveElectedSeenPeer, kActiveFailover};
+
 struct RDE_CONTROL_BLOCK {
   SYSF_MBX mbx;
   NCSCONTEXT task_handle;
@@ -43,6 +46,9 @@ struct RDE_CONTROL_BLOCK {
   bool monitor_lock_thread_running{false};
   bool monitor_takeover_req_thread_running{false};
   std::set<NODE_ID> cluster_members{};
+  // used for discovering peer controllers, regardless of their role
+  std::set<NODE_ID> peer_controllers{};
+  State state{State::kNotActive};
 };
 
 enum RDE_MSG_TYPE {
@@ -54,7 +60,9 @@ enum RDE_MSG_TYPE {
   RDE_MSG_NODE_UP = 6,
   RDE_MSG_NODE_DOWN = 7,
   RDE_MSG_TAKEOVER_REQUEST_CALLBACK = 8,
-  RDE_MSG_ACTIVE_PROMOTION_SUCCESS = 9
+  RDE_MSG_ACTIVE_PROMOTION_SUCCESS = 9,
+  RDE_MSG_CONTROLLER_UP = 10,
+  RDE_MSG_CONTROLLER_DOWN = 11
 };
 
 struct rde_peer_info {
@@ -82,7 +90,9 @@ extern const char *rde_msg_name[];
 
 extern RDE_CONTROL_BLOCK *rde_get_control_block();
 extern uint32_t rde_mds_register();
+extern uint32_t rde_discovery_mds_register();
 extern uint32_t rde_mds_unregister();
+extern uint32_t rde_discovery_mds_unregister();
 extern uint32_t rde_mds_send(rde_msg *msg, MDS_DEST to_dest);
 extern uint32_t rde_set_role(PCS_RDA_ROLE role);
 
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index e5813e4..2d9aa51 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -39,6 +39,7 @@
 #include "osaf/consensus/consensus.h"
 #include "rde/rded/rde_cb.h"
 #include "rde/rded/role.h"
+#include "rde_cb.h"
 
 #define RDA_MAX_CLIENTS 32
 
@@ -56,7 +57,9 @@ const char *rde_msg_name[] = {"-",
                               "RDE_MSG_NODE_UP(6)",
                               "RDE_MSG_NODE_DOWN(7)",
                               "RDE_MSG_TAKEOVER_REQUEST_CALLBACK(8)",
-                              "RDE_MSG_ACTIVE_PROMOTION_SUCCESS(9)"};
+                              "RDE_MSG_ACTIVE_PROMOTION_SUCCESS(9)",
+                              "RDE_MSG_CONTROLLER_UP(10)",
+                              "RDE_MSG_CONTROLLER_DOWN(11)"};
 
 static RDE_CONTROL_BLOCK _rde_cb;
 static RDE_CONTROL_BLOCK *rde_cb = &_rde_cb;
@@ -157,6 +160,23 @@ static void handle_mbx_event() {
       rde_cb->cluster_members.erase(msg->fr_node_id);
       TRACE("cluster_size %zu", rde_cb->cluster_members.size());
       break;
+    case RDE_MSG_CONTROLLER_UP:
+      if (msg->fr_node_id != own_node_id) {
+        rde_cb->peer_controllers.insert(msg->fr_node_id);
+        TRACE("peer_controllers: size %zu", rde_cb->peer_controllers.size());
+        if (rde_cb->state == State::kNotActive) {
+          TRACE("Set state to kNotActiveSeenPeer");
+          rde_cb->state = State::kNotActiveSeenPeer;
+        } else if (rde_cb->state == State::kActiveElected) {
+          TRACE("Set state to kActiveElectedSeenPeer");
+          rde_cb->state = State::kActiveElectedSeenPeer;
+        }
+      }
+      break;
+    case RDE_MSG_CONTROLLER_DOWN:
+      rde_cb->peer_controllers.erase(msg->fr_node_id);
+      TRACE("peer_controllers: size %zu", rde_cb->peer_controllers.size());
+      break;
     case RDE_MSG_TAKEOVER_REQUEST_CALLBACK: {
       rde_cb->monitor_takeover_req_thread_running = false;
 
@@ -179,13 +199,44 @@ static void handle_mbx_event() {
                           "Another controller is taking over the active role. "
                            "Rebooting this node");
           }
-        } else {
-          LOG_NO("Rejected takeover request");
-
-          rde_cb->monitor_takeover_req_thread_running = true;
-          consensus_service.MonitorTakeoverRequest(Role::MonitorCallback,
-                                                   rde_cb->mbx);
+        } else if (state == Consensus::TakeoverState::UNDEFINED) {
+          bool fencing_required = true;
+
+          // differentiate when this occurs after election or
+          // rde has been set active due to failover
+          if (consensus_service.IsRelaxedNodePromotionEnabled() == true) {
+              if (rde_cb->state == State::kActiveElected) {
+                TRACE("Relaxed mode is enabled");
+                TRACE(" No peer SC yet seen, ignore consensus service failure");
+                // if relaxed node promotion is enabled, and we have yet to see
+                // a peer SC after being promoted, tolerate consensus service
+                // not working
+                fencing_required = false;
+              } else if ((rde_cb->state == State::kActiveElectedSeenPeer ||
+                         rde_cb->state == State::kActiveFailover) &&
+                         role->IsPeerPresent() == true) {
+                TRACE("Relaxed mode is enabled");
+                TRACE("Peer SC can be seen, ignore consensus service failure");
+                // we have seen the peer, and peer is still connected, tolerate
+                // consensus service not working
+                fencing_required = false;
+              }
+          }
+          if (fencing_required == true) {
+            LOG_NO("Lost connectivity to consensus service");
+            if (consensus_service.IsRemoteFencingEnabled() == false) {
+                opensaf_reboot(0, nullptr,
+                               "Lost connectivity to consensus service. "
+                               "Rebooting this node");
+            }
+          }
         }
+
+        LOG_NO("Rejected takeover request");
+
+        rde_cb->monitor_takeover_req_thread_running = true;
+        consensus_service.MonitorTakeoverRequest(Role::MonitorCallback,
+                                                 rde_cb->mbx);
       } else {
         LOG_WA("Received takeover request when not active");
       }
@@ -267,6 +318,11 @@ static int initialize_rde() {
     goto init_failed;
   }
 
+  if (rde_discovery_mds_register() != NCSCC_RC_SUCCESS) {
+    LOG_ER("rde_discovery_mds_register() failed");
+    rc = NCSCC_RC_FAILURE;
+    goto init_failed;
+  }
   rc = NCSCC_RC_SUCCESS;
 
 init_failed:
@@ -343,6 +399,7 @@ int main(int argc, char *argv[]) {
     }
 
     if (fds[FD_TERM].revents & POLLIN) {
+      rde_discovery_mds_unregister();
       daemon_exit();
     }
 
diff --git a/src/rde/rded/rde_mds.cc b/src/rde/rded/rde_mds.cc
index 00922ea..bc335f0 100644
--- a/src/rde/rded/rde_mds.cc
+++ b/src/rde/rded/rde_mds.cc
@@ -149,6 +149,31 @@ static uint32_t process_amfnd_mds_evt(struct ncsmds_callback_info *info) {
   return rc;
 }
 
+static uint32_t process_rde_discovery_mds_evt(
+  struct ncsmds_callback_info *info) {
+  uint32_t rc = NCSCC_RC_SUCCESS;
+
+  TRACE_ENTER();
+  osafassert(info->info.svc_evt.i_svc_id == NCSMDS_SVC_ID_RDE_DISCOVERY);
+
+  // process these events in the main thread to avoid
+  // synchronisation issues
+  switch (info->info.svc_evt.i_change) {
+    case NCSMDS_DOWN:
+      rc = mbx_send(RDE_MSG_CONTROLLER_DOWN, info->info.svc_evt.i_dest,
+                    info->info.svc_evt.i_node_id);
+      break;
+    case NCSMDS_UP:
+      rc = mbx_send(RDE_MSG_CONTROLLER_UP, info->info.svc_evt.i_dest,
+                    info->info.svc_evt.i_node_id);
+      break;
+    default:
+      break;
+  }
+
+  return rc;
+}
+
 static uint32_t mds_callback(struct ncsmds_callback_info *info) {
   struct rde_msg *msg;
   uint32_t rc = NCSCC_RC_SUCCESS;
@@ -185,8 +210,10 @@ static uint32_t mds_callback(struct ncsmds_callback_info *info) {
       if (info->info.svc_evt.i_svc_id == NCSMDS_SVC_ID_AVND) {
         rc = process_amfnd_mds_evt(info);
         break;
-      }
-      if (info->info.svc_evt.i_change == NCSMDS_DOWN) {
+      } else if (info->info.svc_evt.i_svc_id == NCSMDS_SVC_ID_RDE_DISCOVERY) {
+        rc = process_rde_discovery_mds_evt(info);
+        break;
+      } else if (info->info.svc_evt.i_change == NCSMDS_DOWN) {
         TRACE("MDS DOWN dest: %" PRIx64 ", node ID: %x, svc_id: %d",
               info->info.svc_evt.i_dest, info->info.svc_evt.i_node_id,
               info->info.svc_evt.i_svc_id);
@@ -218,7 +245,8 @@ done:
 uint32_t rde_mds_register() {
   NCSADA_INFO ada_info;
   NCSMDS_INFO svc_info;
-  MDS_SVC_ID svc_id[] = {NCSMDS_SVC_ID_RDE, NCSMDS_SVC_ID_AVND};
+  MDS_SVC_ID svc_id[] = {NCSMDS_SVC_ID_RDE, NCSMDS_SVC_ID_AVND,
+                         NCSMDS_SVC_ID_RDE_DISCOVERY};
   MDS_DEST mds_adest;
 
   TRACE_ENTER();
@@ -252,7 +280,7 @@ uint32_t rde_mds_register() {
   svc_info.i_mds_hdl = mds_hdl;
   svc_info.i_svc_id = NCSMDS_SVC_ID_RDE;
   svc_info.i_op = MDS_RED_SUBSCRIBE;
-  svc_info.info.svc_subscribe.i_num_svcs = 2;
+  svc_info.info.svc_subscribe.i_num_svcs = 3;
   svc_info.info.svc_subscribe.i_scope = NCSMDS_SCOPE_NONE;
   svc_info.info.svc_subscribe.i_svc_ids = svc_id;
 
@@ -266,6 +294,43 @@ uint32_t rde_mds_register() {
   return NCSCC_RC_SUCCESS;
 }
 
+uint32_t rde_discovery_mds_register() {
+  NCSADA_INFO ada_info;
+  NCSMDS_INFO svc_info;
+  MDS_DEST mds_adest;
+
+  TRACE_ENTER();
+
+  ada_info.req = NCSADA_GET_HDLS;
+  if (ncsada_api(&ada_info) != NCSCC_RC_SUCCESS) {
+    LOG_ER("%s: NCSADA_GET_HDLS Failed", __FUNCTION__);
+    return NCSCC_RC_FAILURE;
+  }
+
+  mds_hdl = ada_info.info.adest_get_hdls.o_mds_pwe1_hdl;
+  mds_adest = ada_info.info.adest_get_hdls.o_adest;
+
+  svc_info.i_mds_hdl = mds_hdl;
+  svc_info.i_svc_id = NCSMDS_SVC_ID_RDE_DISCOVERY;
+  svc_info.i_op = MDS_INSTALL;
+
+  svc_info.info.svc_install.i_yr_svc_hdl = 0;
+  // node specific
+  svc_info.info.svc_install.i_install_scope = NCSMDS_SCOPE_NONE;
+  svc_info.info.svc_install.i_svc_cb = mds_callback; /* callback */
+  svc_info.info.svc_install.i_mds_q_ownership = false;
+  svc_info.info.svc_install.i_mds_svc_pvt_ver = RDE_MDS_PVT_SUBPART_VERSION;
+
+  if (ncsmds_api(&svc_info) == NCSCC_RC_FAILURE) {
+    LOG_ER("%s: MDS Install Failed", __FUNCTION__);
+    return NCSCC_RC_FAILURE;
+  }
+
+  TRACE_LEAVE2("NodeId:%x, mds_adest:%" PRIx64, ncs_get_node_id(), mds_adest);
+
+  return NCSCC_RC_SUCCESS;
+}
+
 uint32_t rde_mds_unregister() {
   NCSMDS_INFO mds_info;
   TRACE_ENTER();
@@ -287,6 +352,27 @@ uint32_t rde_mds_unregister() {
   return rc;
 }
 
+uint32_t rde_discovery_mds_unregister() {
+  NCSMDS_INFO mds_info;
+  TRACE_ENTER();
+
+  /* Un-install your service into MDS.
+   No need to cancel the services that are subscribed */
+  memset(&mds_info, 0, sizeof(NCSMDS_INFO));
+
+  mds_info.i_mds_hdl = mds_hdl;
+  mds_info.i_svc_id = NCSMDS_SVC_ID_RDE_DISCOVERY;
+  mds_info.i_op = MDS_UNINSTALL;
+
+  uint32_t rc = ncsmds_api(&mds_info);
+  if (rc != NCSCC_RC_SUCCESS) {
+    LOG_WA("MDS Unregister Failed");
+  }
+
+  TRACE_LEAVE2("retval = %u", rc);
+  return rc;
+}
+
 uint32_t rde_mds_send(struct rde_msg *msg, MDS_DEST to_dest) {
   NCSMDS_INFO info;
   uint32_t rc;
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index ecab773..a967bd5 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -91,22 +91,26 @@ void Role::MonitorCallback(const std::string& key, const std::string& new_value,
   osafassert(status == NCSCC_RC_SUCCESS);
 }
 
-void Role::PromoteNode(const uint64_t cluster_size) {
+void Role::PromoteNode(const uint64_t cluster_size,
+                       const bool relaxed_mode) {
   TRACE_ENTER();
   SaAisErrorT rc;
 
   Consensus consensus_service;
+  bool promotion_pending = false;
 
   rc = consensus_service.PromoteThisNode(true, cluster_size);
-  if (rc != SA_AIS_OK && rc != SA_AIS_ERR_EXIST) {
-    LOG_ER("Unable to set active controller in consensus service");
-    opensaf_reboot(0, nullptr,
-                   "Unable to set active controller in consensus service");
-  }
-
   if (rc == SA_AIS_ERR_EXIST) {
     LOG_WA("Another controller is already active");
     return;
+  } else if (rc != SA_AIS_OK && relaxed_mode == true) {
+    LOG_WA("Unable to set active controller in consensus service");
+    LOG_WA("Will become active anyway");
+    promotion_pending = true;
+  } else if (rc != SA_AIS_OK) {
+    LOG_ER("Unable to set active controller in consensus service");
+    opensaf_reboot(0, nullptr,
+                   "Unable to set active controller in consensus service");
   }
 
   RDE_CONTROL_BLOCK* cb = rde_get_control_block();
@@ -117,9 +121,26 @@ void Role::PromoteNode(const uint64_t cluster_size) {
   uint32_t status;
   status = m_NCS_IPC_SEND(&cb->mbx, msg, NCS_IPC_PRIORITY_HIGH);
   osafassert(status == NCSCC_RC_SUCCESS);
+
+  if (promotion_pending) {
+    osafassert(consensus_service.IsRelaxedNodePromotionEnabled() == true);
+    // the node has been promoted, even though the lock has not been obtained
+    // keep trying the consensus service
+    while (rc != SA_AIS_OK) {
+      rc = consensus_service.PromoteThisNode(true, cluster_size);
+      if (rc == SA_AIS_ERR_EXIST) {
+        LOG_ER("Unable to set active controller in consensus service");
+        opensaf_reboot(0, nullptr,
+                       "Unable to set active controller in consensus service");
+      }
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+    }
+    LOG_NO("Successfully set active controller in consensus service");
+  }
 }
 
 void Role::NodePromoted() {
+  // promoted to active from election
   ExecutePreActiveScript();
   LOG_NO("Switched to ACTIVE from %s", to_string(role()));
   role_ = PCS_RDA_ACTIVE;
@@ -127,6 +148,13 @@ void Role::NodePromoted() {
 
   Consensus consensus_service;
   RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+  if (cb->peer_controllers.empty() == false) {
+    TRACE("Set state to kActiveElectedSeenPeer");
+    cb->state = State::kActiveElectedSeenPeer;
+  } else {
+    TRACE("Set state to kActiveElected");
+    cb->state = State::kActiveElected;
+  }
 
   // register for callback if active controller is changed
   // in consensus service
@@ -161,8 +189,24 @@ timespec* Role::Poll(timespec* ts) {
     } else {
       election_end_time_ = base::kTimespecMax;
       RDE_CONTROL_BLOCK* cb = rde_get_control_block();
-      std::thread(&Role::PromoteNode,
-                 this, cb->cluster_members.size()).detach();
+
+      bool is_candidate = IsCandidate();
+      Consensus consensus_service;
+      if (consensus_service.IsEnabled() == true &&
+        is_candidate == false &&
+        consensus_service.IsWritable() == false) {
+        // node promotion will fail resulting in node reboot,
+        // reset timer and try later
+        TRACE("reset timer and try later");
+        ResetElectionTimer();
+        now = base::ReadMonotonicClock();
+        *ts = election_end_time_ - now;
+        timeout = ts;
+      } else {
+        std::thread(&Role::PromoteNode,
+                    this, cb->cluster_members.size(),
+                    is_candidate).detach();
+      }
     }
   }
   return timeout;
@@ -177,10 +221,42 @@ void Role::ExecutePreActiveScript() {
 
 void Role::AddPeer(NODE_ID node_id) {
   auto result = known_nodes_.insert(node_id);
-  if (result.second) ResetElectionTimer();
+  if (result.second) {
+    ResetElectionTimer();
+  }
+}
+
+// call from main thread only
+bool Role::IsCandidate() {
+  TRACE_ENTER();
+  bool result = false;
+  Consensus consensus_service;
+  RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+
+  // if relaxed node promotion is enabled, allow this node to be promoted
+  // active if it can see a peer SC and this node has the lowest node ID
+  if (consensus_service.IsRelaxedNodePromotionEnabled() == true &&
+      cb->state == State::kNotActiveSeenPeer) {
+    LOG_NO("Relaxed node promotion enabled. This node is a candidate.");
+    result = true;
+  }
+
+  return result;
+}
+
+bool Role::IsPeerPresent() {
+  bool result = false;
+  RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+
+  if (cb->peer_controllers.empty() == false) {
+    result = true;
+  }
+
+  return result;
 }
 
 uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
+  TRACE_ENTER();
   PCS_RDA_ROLE old_role = role_;
   if (new_role == PCS_RDA_ACTIVE &&
       (old_role == PCS_RDA_UNDEFINED || old_role == PCS_RDA_QUIESCED)) {
@@ -196,6 +272,7 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
       // in consensus service
       Consensus consensus_service;
       RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+      cb->state = State::kActiveFailover;
       if (cb->monitor_lock_thread_running == false) {
         cb->monitor_lock_thread_running = true;
         consensus_service.MonitorLock(MonitorCallback, cb->mbx);
diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
index 9780deb..1920f59 100644
--- a/src/rde/rded/role.h
+++ b/src/rde/rded/role.h
@@ -34,6 +34,8 @@ class Role {
  public:
   explicit Role(NODE_ID own_node_id);
   void AddPeer(NODE_ID node_id);
+  bool IsCandidate();
+  bool IsPeerPresent();
   void SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id);
   timespec* Poll(timespec* ts);
   uint32_t SetRole(PCS_RDA_ROLE new_role);
@@ -49,7 +51,7 @@ class Role {
   void ExecutePreActiveScript();
   void ResetElectionTimer();
   uint32_t UpdateMdsRegistration(PCS_RDA_ROLE new_role, PCS_RDA_ROLE old_role);
-  void PromoteNode(const uint64_t cluster_size);
+  void PromoteNode(const uint64_t cluster_size, const bool relaxed_mode);
 
   std::set<NODE_ID> known_nodes_;
   PCS_RDA_ROLE role_;
-- 
2.7.4



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to