---
 src/fm/fmd/fm_cb.h    |  2 +-
 src/fm/fmd/fm_main.cc | 26 +++++---------------------
 src/fm/fmd/fm_mds.cc  |  2 ++
 src/fm/fmd/fm_rda.cc  | 27 ++++++++++++++++++++++-----
 4 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/src/fm/fmd/fm_cb.h b/src/fm/fmd/fm_cb.h
index cfa50d883..010ab735a 100644
--- a/src/fm/fmd/fm_cb.h
+++ b/src/fm/fmd/fm_cb.h
@@ -100,7 +100,7 @@ struct FM_CB {
 
   std::atomic<bool> peer_sc_up{false};
   bool well_connected{false};
-  uint64_t cluster_size{};
+  std::atomic<uint64_t> cluster_size{};
   struct timespec last_well_connected{};
   struct timespec node_isolation_timeout{};
   SaClmHandleT clm_hdl{};
diff --git a/src/fm/fmd/fm_main.cc b/src/fm/fmd/fm_main.cc
index 73c9b9ccd..3371ec5e8 100644
--- a/src/fm/fmd/fm_main.cc
+++ b/src/fm/fmd/fm_main.cc
@@ -551,21 +551,12 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT 
*fm_mbx_evt) {
            * trigerred quicker than the node_down event
            * has been received.
            */
-          if (fm_cb->role == PCS_RDA_STANDBY) {
-            const std::string current_active =
-                consensus_service.CurrentActive();
-            if (current_active.compare(osaf_extended_name_borrow(
-                    &fm_cb->peer_clm_node_name)) == 0) {
-              // update consensus service, before fencing old active controller
-              consensus_service.DemoteCurrentActive();
-            }
-          }
 
           if (fm_cb->use_remote_fencing) {
             if (fm_cb->peer_node_terminated == false) {
               // if peer_sc_up is true then
               // the node has come up already
-              if (fm_cb->peer_sc_up == false && fm_cb->immnd_down == true) {
+              if (consensus_service.IsEnabled() == false) {
                 opensaf_reboot(fm_cb->peer_node_id,
                                (char *)fm_cb->peer_clm_node_name.value,
                                "Received Node Down for peer controller");
@@ -580,8 +571,7 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT 
*fm_mbx_evt) {
             fm_cb->mutex_.Lock();
             peer_node_name = fm_cb->peer_node_name;
             fm_cb->mutex_.Unlock();
-            opensaf_reboot(fm_cb->peer_node_id,
-                           peer_node_name.c_str(),
+            opensaf_reboot(fm_cb->peer_node_id, peer_node_name.c_str(),
                            "Received Node Down for peer controller");
           }
           if (!((fm_cb->role == PCS_RDA_ACTIVE) &&
@@ -632,12 +622,6 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT 
*fm_mbx_evt) {
         }
 
         Consensus consensus_service;
-        const std::string current_active = consensus_service.CurrentActive();
-        if (current_active.compare(
-                osaf_extended_name_borrow(&fm_cb->peer_clm_node_name)) == 0) {
-          // update consensus service, before fencing old active controller
-          consensus_service.DemoteCurrentActive();
-        }
 
         /* Now. Try resetting other blade */
         fm_cb->role = PCS_RDA_ACTIVE;
@@ -645,7 +629,8 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT 
*fm_mbx_evt) {
         LOG_NO("Reseting peer controller node id: %x",
                unsigned(fm_cb->peer_node_id));
         if (fm_cb->use_remote_fencing) {
-          if (fm_cb->peer_node_terminated == false) {
+          if (fm_cb->peer_node_terminated == false &&
+              consensus_service.IsEnabled() == false) {
             opensaf_reboot(fm_cb->peer_node_id,
                            (char *)fm_cb->peer_clm_node_name.value,
                            "Received Node Down for peer controller");
@@ -658,8 +643,7 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT 
*fm_mbx_evt) {
           fm_cb->mutex_.Lock();
           peer_node_name = fm_cb->peer_node_name;
           fm_cb->mutex_.Unlock();
-          opensaf_reboot(fm_cb->peer_node_id,
-                         peer_node_name.c_str(),
+          opensaf_reboot(fm_cb->peer_node_id, peer_node_name.c_str(),
                          "Received Node Down for Active peer");
         }
         fm_rda_set_role(fm_cb, PCS_RDA_ACTIVE);
diff --git a/src/fm/fmd/fm_mds.cc b/src/fm/fmd/fm_mds.cc
index 277a357d2..60db5dab1 100644
--- a/src/fm/fmd/fm_mds.cc
+++ b/src/fm/fmd/fm_mds.cc
@@ -373,6 +373,7 @@ static uint32_t fm_mds_node_evt(FM_CB *cb,
     case NCSMDS_NODE_DOWN:
       if (cb->cluster_size != 0) {
         --cb->cluster_size;
+        TRACE("cluster_size %llu", (unsigned long long)cb->cluster_size);
         TRACE("Node down event for node id %x, cluster size is now: %llu",
               node_evt->node_id, (unsigned long long)cb->cluster_size);
         check_for_node_isolation(cb);
@@ -397,6 +398,7 @@ static uint32_t fm_mds_node_evt(FM_CB *cb,
 
     case NCSMDS_NODE_UP:
       ++cb->cluster_size;
+      TRACE("cluster_size %llu", (unsigned long long)cb->cluster_size);
       TRACE("Node up event for node id %x, cluster size is now: %llu",
             node_evt->node_id, (unsigned long long)cb->cluster_size);
       check_for_node_isolation(cb);
diff --git a/src/fm/fmd/fm_rda.cc b/src/fm/fmd/fm_rda.cc
index 47e1f1d32..af337f868 100644
--- a/src/fm/fmd/fm_rda.cc
+++ b/src/fm/fmd/fm_rda.cc
@@ -87,11 +87,28 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb, PCS_RDA_ROLE role) {
   osafassert(role == PCS_RDA_ACTIVE);
 
   Consensus consensus_service;
-  rc = consensus_service.PromoteThisNode();
-  if (rc != SA_AIS_OK) {
-    LOG_ER("Unable to set active controller in consensus service");
-    opensaf_reboot(0, nullptr,
-                   "Unable to set active controller in consensus service");
+  if (consensus_service.IsEnabled() == true) {
+    // Allow topology events to be processed first. The MDS thread may
+    // be processing MDS down events and updating cluster_size concurrently.
+    // We need cluster_size to be as accurate as possible, without waiting
+    // too long for node down events.
+    std::this_thread::sleep_for(std::chrono::seconds(3));
+
+    rc = consensus_service.PromoteThisNode(true, fm_cb->cluster_size);
+    if (rc != SA_AIS_OK && rc != SA_AIS_ERR_EXIST) {
+      LOG_ER("Unable to set active controller in consensus service");
+      opensaf_reboot(0, nullptr,
+                     "Unable to set active controller in consensus service");
+    } else if (rc == SA_AIS_ERR_EXIST) {
+      // @todo Without a reboot we do not seem to recover from this state.
+      // Can recovery be improved so that a reboot is not needed?
+      LOG_ER(
+          "A controller is already active. We were separated from the "
+          "cluster?");
+      opensaf_reboot(0, nullptr,
+                     "A controller is already active. We were separated "
+                     "from the cluster?");
+    }
   }
 
   rc = pcs_rda_request(&rda_req);
-- 
2.14.1


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to