Ack for the patch.

regards,
Anders Widell

On 01/23/2018 09:06 AM, Gary Lee wrote:
When a node goes down and split-brain prevention is enabled,
check that we still have write access to the consensus service.
If not, and remote fencing is disabled, reboot this node to prevent
split-brain.
---
  src/amf/amfd/ndproc.cc    | 12 +++++++++++-
  src/amf/amfd/osaf-amfd.in |  4 ++++
  src/amf/amfd/role.cc      | 30 +++++++++++++++++++++++++-----
  3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc
index 0c6316627..df68b3dbf 100644
--- a/src/amf/amfd/ndproc.cc
+++ b/src/amf/amfd/ndproc.cc
@@ -32,8 +32,8 @@
   */
#include "osaf/immutil/immutil.h"
+#include "osaf/consensus/service.h"
  #include "base/logtrace.h"
-
  #include "amf/amfd/amfd.h"
  #include "amf/amfd/imm.h"
  #include "amf/amfd/cluster.h"
@@ -1221,5 +1221,15 @@ void avd_node_failover(AVD_AVND *node) {
    avd_pg_node_csi_del_all(avd_cb, node);
    avd_node_down_mw_susi_failover(avd_cb, node);
    avd_node_down_appl_susi_failover(avd_cb, node);
+
+  Consensus consensus_service;
+  if (consensus_service.IsRemoteFencingEnabled() == false &&
+      consensus_service.IsWritable() == false) {
+    // remote fencing is disabled and we have lost write access
+    // reboot this node to prevent split brain
+    opensaf_reboot(0, nullptr,
+      "Quorum lost. Rebooting this node to prevent split-brain");
+  }
+
    TRACE_LEAVE();
  }
diff --git a/src/amf/amfd/osaf-amfd.in b/src/amf/amfd/osaf-amfd.in
index 45c5ab9e4..26a77ef52 100644
--- a/src/amf/amfd/osaf-amfd.in
+++ b/src/amf/amfd/osaf-amfd.in
@@ -28,6 +28,10 @@ else
        . $pkgsysconfdir/amfd.conf
  fi    
+if [ -f "$pkgsysconfdir/fmd.conf" ]; then
+  . "$pkgsysconfdir/fmd.conf"
+fi
+
  binary=$pkglibdir/$osafprog
  pidfile=$pkgpiddir/$osafprog.pid
  lockfile=$lockdir/$initscript
diff --git a/src/amf/amfd/role.cc b/src/amf/amfd/role.cc
index 865d89d94..862ac3653 100644
--- a/src/amf/amfd/role.cc
+++ b/src/amf/amfd/role.cc
@@ -38,6 +38,7 @@
  #include "osaf/immutil/immutil.h"
  #include "base/logtrace.h"
  #include "rde/agent/rda_papi.h"
+#include "osaf/consensus/service.h"
#include "amf/amfd/amfd.h"
  #include "amf/amfd/imm.h"
@@ -1085,6 +1086,12 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB *cb) {
      avd_d2n_msg_dequeue(cb);
    }
+ Consensus consensus_service;
+  rc = consensus_service.DemoteThisNode();
+  if (rc != SA_AIS_OK) {
+    LOG_ER("Failed to demote this node from consensus service");
+  }
+
    TRACE_LEAVE();
    return NCSCC_RC_SUCCESS;
  }
@@ -1209,13 +1216,21 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
    cb->avail_state_avd = SA_AMF_HA_ACTIVE;
    osaf_mutex_unlock_ordie(&imm_reinit_mutex);
+ Consensus consensus_service;
+  rc = consensus_service.PromoteThisNode();
+  if (rc != SA_AIS_OK) {
+    LOG_ER("Unable to set active controller in consensus service");
+    osafassert(false);
+  }
+
    /* Declare this standby as Active. Set Vdest role role */
    if (NCSCC_RC_SUCCESS !=
        (status = avd_mds_set_vdest_role(cb, SA_AMF_HA_ACTIVE))) {
      LOG_ER("Switch Standby --> Active FAILED, MDS role set failed");
      cb->swap_switch = false;
      avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
-    return NCSCC_RC_FAILURE;
+    status = NCSCC_RC_FAILURE;
+    goto done;
    }
/* Time to send fail-over messages to all the AVND's */
@@ -1240,7 +1255,8 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
      } else {
        cb->swap_switch = false;
        avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
-      return NCSCC_RC_FAILURE;
+      status = NCSCC_RC_FAILURE;
+      goto done;
      }
    }
@@ -1259,7 +1275,8 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
           in avd_imm_reinit_bg_thread.*/
      } else {
        avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
-      return NCSCC_RC_FAILURE;
+      status = NCSCC_RC_FAILURE;
+      goto done;
      }
    } else
      osaf_mutex_unlock_ordie(&imm_reinit_mutex);
@@ -1274,7 +1291,8 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
      LOG_ER("Switch Standby --> Active, clm track start failed");
      Fifo::queue(new ClmTrackStart());
      avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
-    return NCSCC_RC_FAILURE;
+    status = NCSCC_RC_FAILURE;
+    goto done;
    }
/* Send the message to other avd for role change rsp as success */
@@ -1291,8 +1309,10 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
      }
    }
+ status = NCSCC_RC_SUCCESS;
+done:
    TRACE_LEAVE();
-  return NCSCC_RC_SUCCESS;
+  return status;
  }
/****************************************************************************\


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to