00-README.conf | 47 +++++++++
osaf/services/infrastructure/fm/config/fmd.conf | 9 +-
osaf/services/infrastructure/fm/fms/Makefile.am | 3 +-
osaf/services/infrastructure/fm/fms/fm_cb.h | 4 +
osaf/services/infrastructure/fm/fms/fm_main.c | 124 +++++++++++++++++++++++-
scripts/opensaf_reboot | 47 ++++++--
6 files changed, 216 insertions(+), 18 deletions(-)
diff --git a/00-README.conf b/00-README.conf
--- a/00-README.conf
+++ b/00-README.conf
@@ -530,3 +530,50 @@ and not access any of its members direct
saAisNameBorrow() access functions shall be used. The
SA_MAX_UNEXTENDED_NAME_LENGTH constant can be used to refer to the maximum
string length that can be stored in the unextended SaNameT type.
+
+Configuring remote fencing support using STONITH
+================================================
+
+In a virtualized environment, STONITH can be used for remote fencing of the
+other system controller in case of "link loss" or when the peer system
+controller is "live hanging", in order to avoid split-brain.
+Node self-fencing will also be used if, e.g., the active controller loses
+connectivity to all other nodes in the cluster.
+
+Example installation on Ubuntu 14.04:
+
+On each virtual node install stonith package:
+
+ sudo apt-get install cluster-glue
+
+The name of each virtual node should be the same as the clm node name,
+e.g. for safNode=SC-2,safCluster=myClmCluster the virtual node name should be SC-2.
+
+If a firewall is used on the "hypervisor" host, TCP port 16509 has to be
+opened in it. If ssh is used, use ssh-keygen to generate ssh keys for each
+virtual node.
+
+To verify the installation virsh can be used, e.g:
+virsh --connect=qemu+tcp://192.168.122.1/system list --all
+
+Example of output:
+Id Name State
+----------------------------------------------------
+ 2 SC-1 running
+ 3 SC-2 running
+ 4 PL-3 running
+
+Update the fmd.conf file:
+
+# The Promote active timer is set to delay the Standby controller's reboot
+# request, as the Active controller is probably also requesting a reboot of
+# the standby. The resolution is in 10 ms units.
+export FMS_PROMOTE_ACTIVE_TIMER=300
+
+# Uncomment the next 5 lines and update accordingly to enable remote fencing
+# See also documentation for STONITH
+export FMS_USE_REMOTE_FENCING=1
+export FMS_FENCE_CMD="stonith"
+export FMS_DEVICE_TYPE="external/libvirt"
+export FMS_HYPERVISOR_URI="qemu+tcp://192.168.122.1/system"
+export FMS_FENCE_ACTION="reset"
diff --git a/osaf/services/infrastructure/fm/config/fmd.conf
b/osaf/services/infrastructure/fm/config/fmd.conf
--- a/osaf/services/infrastructure/fm/config/fmd.conf
+++ b/osaf/services/infrastructure/fm/config/fmd.conf
@@ -17,7 +17,14 @@ export FM_CONTROLLER2_SUBSLOT=15
export FMS_HA_ENV_HEALTHCHECK_KEY="Default"
# Promote active timer
-export FMS_PROMOTE_ACTIVE_TIMER=0
+export FMS_PROMOTE_ACTIVE_TIMER=500
+
+# Uncomment the next 5 lines and update accordingly to enable remote fencing
+#export FMS_USE_REMOTE_FENCING=1
+#export FMS_FENCE_CMD="stonith"
+#export FMS_DEVICE_TYPE="external/libvirt"
+#export FMS_HYPERVISOR_URI="qemu+tcp://192.168.122.1/system"
+#export FMS_FENCE_ACTION="reset"
# FM will supervise transitions to the ACTIVE role when this variable is set to
# a non-zero value. The value is the time in the unit of 10 ms to wait for a
diff --git a/osaf/services/infrastructure/fm/fms/Makefile.am
b/osaf/services/infrastructure/fm/fms/Makefile.am
--- a/osaf/services/infrastructure/fm/fms/Makefile.am
+++ b/osaf/services/infrastructure/fm/fms/Makefile.am
@@ -46,4 +46,5 @@ osaffmd_SOURCES = \
osaffmd_LDADD = \
$(top_builddir)/osaf/libs/core/libopensaf_core.la \
$(top_builddir)/osaf/libs/saf/libSaAmf/libSaAmf.la \
- $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la
+ $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la \
+ $(top_builddir)/osaf/libs/saf/libSaClm/libSaClm.la
diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h
b/osaf/services/infrastructure/fm/fms/fm_cb.h
--- a/osaf/services/infrastructure/fm/fms/fm_cb.h
+++ b/osaf/services/infrastructure/fm/fms/fm_cb.h
@@ -26,6 +26,7 @@
#include "mds_papi.h"
#include "rda_papi.h"
#include "fm_amf.h"
+#include "saClm.h"
#include <stdbool.h>
#include <stdint.h>
@@ -102,6 +103,9 @@ typedef struct fm_cb {
uint64_t cluster_size;
struct timespec last_well_connected;
struct timespec node_isolation_timeout;
+ SaClmHandleT clm_hdl;
+ bool use_remote_fencing;
+ SaNameT peer_clm_node_name;
} FM_CB;
extern char *role_string[];
diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c
b/osaf/services/infrastructure/fm/fms/fm_main.c
--- a/osaf/services/infrastructure/fm/fms/fm_main.c
+++ b/osaf/services/infrastructure/fm/fms/fm_main.c
@@ -32,6 +32,13 @@ This file contains the main() routine fo
#include "fm.h"
#include "osaf_time.h"
+#define FM_CLM_API_TIMEOUT 10000000000LL /* timeout handed to saClmClusterNodeGet_4(); presumably SaTimeT nanoseconds, i.e. 10 s -- TODO confirm */
+
+static SaVersionT clm_version = { 'B', 4, 1 }; /* version handed to saClmInitialize_4() */
+static const SaClmCallbacksT_4 clm_callbacks = { /* callback table handed to saClmInitialize_4(); both entries are zero */
+ 0, 0
+};
+
enum {
FD_TERM = 0,
FD_AMF = 1,
@@ -54,6 +61,8 @@ static uint32_t fm_get_args(FM_CB *);
static uint32_t fms_fms_exchange_node_info(FM_CB *);
static uint32_t fm_nid_notify(uint32_t);
static uint32_t fm_tmr_start(FM_TMR *, SaTimeT);
+static SaAisErrorT get_peer_clm_node_name(NODE_ID);
+static SaAisErrorT fm_clm_init();
static void fm_mbx_msg_handler(FM_CB *, FM_EVT *);
static void fm_evt_proc_rda_callback(FM_CB*, FM_EVT*);
static void fm_tmr_exp(void *);
@@ -313,6 +322,8 @@ uint32_t initialize_for_assignment(FM_CB
LOG_ER("immd_mds_register FAILED %d", rc);
goto done;
}
+
+ cb->clm_hdl = 0;
cb->fully_initialized = true;
done:
TRACE_LEAVE2("rc = %u", rc);
@@ -383,8 +394,17 @@ static uint32_t fm_agents_startup(void)
*****************************************************************************/
static uint32_t fm_get_args(FM_CB *fm_cb)
{
+ char *use_remote_fencing = NULL;
char *value;
TRACE_ENTER();
+
+ fm_cb->use_remote_fencing = false;
+ use_remote_fencing = getenv("FMS_USE_REMOTE_FENCING");
+ if (use_remote_fencing != NULL) {
+ fm_cb->use_remote_fencing = true;
+ LOG_NO("Remote fencing is enabled");
+ }
+
value = getenv("EE_ID");
if (value != NULL) {
fm_cb->node_name.length = strlen(value);
@@ -474,6 +494,85 @@ void fm_proc_svc_down(FM_CB *cb, FM_EVT
}
/****************************************************************************
+* Name          : get_peer_clm_node_name
+*
+* Description   : Asks CLM for the node name of the given node and stores the
+*                 value of its first RDN (e.g. "SC-2" taken from
+*                 "safNode=SC-2,safCluster=myClmCluster") in
+*                 fm_cb->peer_clm_node_name. That name is later passed to
+*                 opensaf_reboot when remote fencing is enabled.
+*
+* Arguments     : node_id - node id to look up in CLM.
+*
+* Return Values : SA_AIS_OK on success, otherwise the first error that
+*                 occurred (CLM initialize, node lookup, name parsing or
+*                 CLM finalize).
+*
+* Notes         : A CLM handle is initialized and finalized on every call.
+*                 fm_cb->peer_clm_node_name is left untouched on failure.
+*****************************************************************************/
+static SaAisErrorT get_peer_clm_node_name(NODE_ID node_id)
+{
+	SaAisErrorT rc;
+	SaAisErrorT finalize_rc;
+	SaClmClusterNodeT_4 cluster_node;
+	char dn[sizeof(cluster_node.nodeName.value) + 1];
+	size_t dn_len;
+	const char *rdn_value;
+	size_t rdn_len;
+
+	if ((rc = fm_clm_init()) != SA_AIS_OK) {
+		LOG_ER("clm init FAILED %d", rc);
+		/* No valid handle: nothing to query and nothing to finalize. */
+		return rc;
+	}
+	LOG_NO("clm init OK");
+
+	rc = saClmClusterNodeGet_4(fm_cb->clm_hdl, node_id, FM_CLM_API_TIMEOUT, &cluster_node);
+	if (rc != SA_AIS_OK) {
+		LOG_WA("saClmClusterNodeGet_4 returned %u", (unsigned) rc);
+		goto finalize;
+	}
+
+	/* The SaNameT value is length-counted, so work on a NUL-terminated copy. */
+	dn_len = cluster_node.nodeName.length;
+	if (dn_len > sizeof(cluster_node.nodeName.value)) {
+		dn_len = sizeof(cluster_node.nodeName.value);
+	}
+	memcpy(dn, cluster_node.nodeName.value, dn_len);
+	dn[dn_len] = '\0';
+
+	/* Extract the text between the first '=' and the following ','. */
+	rdn_value = strchr(dn, '=');
+	if (rdn_value == NULL) {
+		LOG_ER("Unexpected clm node name format: %s", dn);
+		rc = SA_AIS_ERR_FAILED_OPERATION;
+		goto finalize;
+	}
+	rdn_value++;
+	rdn_len = strcspn(rdn_value, ",");
+	if (rdn_len == 0 || rdn_len >= sizeof(fm_cb->peer_clm_node_name.value)) {
+		LOG_ER("Unexpected clm node name format: %s", dn);
+		rc = SA_AIS_ERR_FAILED_OPERATION;
+		goto finalize;
+	}
+
+	memcpy(fm_cb->peer_clm_node_name.value, rdn_value, rdn_len);
+	fm_cb->peer_clm_node_name.value[rdn_len] = '\0';
+	fm_cb->peer_clm_node_name.length = rdn_len;
+	LOG_NO("Peer clm node name: %s", fm_cb->peer_clm_node_name.value);
+
+finalize:
+	finalize_rc = saClmFinalize(fm_cb->clm_hdl);
+	if (finalize_rc != SA_AIS_OK) {
+		LOG_ER("clm finalize FAILED %d", finalize_rc);
+		/* Keep an earlier lookup/parse error; report finalize only if it is the first failure. */
+		if (rc == SA_AIS_OK) {
+			rc = finalize_rc;
+		}
+	}
+
+	return rc;
+}
+
+/****************************************************************************
+* Name          : fm_clm_init
+*
+* Description   : Initializes a CLM handle and stores it in fm_cb->clm_hdl.
+*                 The call is retried, sleeping kHundredMilliseconds between
+*                 attempts, for as long as saClmInitialize_4() returns
+*                 SA_AIS_ERR_TRY_AGAIN, SA_AIS_ERR_TIMEOUT or
+*                 SA_AIS_ERR_UNAVAILABLE.
+*
+* Arguments     : None.
+*
+* Return Values : SA_AIS_OK on success, otherwise the non-retryable error
+*                 returned by saClmInitialize_4().
+*
+* Notes         : There is no upper bound on the number of retries, so the
+*                 caller blocks until CLM answers with something else.
+*****************************************************************************/
+static SaAisErrorT fm_clm_init()
+{
+	SaAisErrorT rc;
+
+	for (;;) {
+		/* The version argument is in/out; pass a copy so clm_version keeps the requested value. */
+		SaVersionT version = clm_version;
+
+		rc = saClmInitialize_4(&fm_cb->clm_hdl, &clm_callbacks, &version);
+		if (rc == SA_AIS_OK) {
+			break;
+		}
+
+		if (rc != SA_AIS_ERR_TRY_AGAIN &&
+		    rc != SA_AIS_ERR_TIMEOUT &&
+		    rc != SA_AIS_ERR_UNAVAILABLE) {
+			LOG_ER("Failed to Initialize with CLM: %u", rc);
+			break;
+		}
+
+		/* Log each retry once; TRY_AGAIN retries stay silent to avoid flooding the log. */
+		if (rc != SA_AIS_ERR_TRY_AGAIN) {
+			LOG_WA("saClmInitialize_4 returned %u", (unsigned) rc);
+		}
+		osaf_nanosleep(&kHundredMilliseconds);
+	}
+
+	return rc;
+}
+
+/****************************************************************************
* Name : fm_mbx_msg_handler
*
* Description : Processes Mail box messages between FM.
@@ -517,8 +616,13 @@ static void fm_mbx_msg_handler(FM_CB *fm
* but just that failover has been
trigerred quicker than the
* node_down event has been received.
*/
- opensaf_reboot(fm_cb->peer_node_id, (char
*)fm_cb->peer_node_name.value,
- "Received Node Down for peer
controller");
+ if (fm_cb->use_remote_fencing) {
+ opensaf_reboot(fm_cb->peer_node_id,
(char *)fm_cb->peer_clm_node_name.value,
+ "Received Node Down for
peer controller");
+ } else {
+ opensaf_reboot(fm_cb->peer_node_id,
(char *)fm_cb->peer_node_name.value,
+ "Received Node Down for
peer controller");
+ }
if (!((fm_cb->role == PCS_RDA_ACTIVE) &&
(fm_cb->amf_state == (SaAmfHAStateT)PCS_RDA_ACTIVE))) {
fm_cb->role = PCS_RDA_ACTIVE;
LOG_NO("Controller Failover: Setting
role to ACTIVE");
@@ -534,6 +638,10 @@ static void fm_mbx_msg_handler(FM_CB *fm
/* Peer fm came up so sending ee_id of this node */
if (fm_cb->node_name.length != 0)
fms_fms_exchange_node_info(fm_cb);
+
+ if (fm_cb->use_remote_fencing) {
+ get_peer_clm_node_name(fm_mbx_evt->node_id);
+ }
break;
case FM_EVT_TMR_EXP:
/* Timer Expiry event posted */
@@ -547,8 +655,16 @@ static void fm_mbx_msg_handler(FM_CB *fm
fm_cb->role = PCS_RDA_ACTIVE;
LOG_NO("Reseting peer controller node id: %x",
fm_cb->peer_node_id);
- opensaf_reboot(fm_cb->peer_node_id, (char
*)fm_cb->peer_node_name.value,
- "Received Node Down for Active peer");
+ if (fm_cb->use_remote_fencing) {
+ LOG_NO("saClmClusterNodeGet succeeded node_id
0x%X, clm peer node name %s",
+ fm_mbx_evt->node_id,
fm_cb->peer_clm_node_name.value);
+
+ opensaf_reboot(fm_cb->peer_node_id, (char
*)fm_cb->peer_clm_node_name.value,
+ "Received Node Down for peer
controller");
+ } else {
+ opensaf_reboot(fm_cb->peer_node_id, (char
*)fm_cb->peer_node_name.value,
+ "Received Node Down for Active
peer");
+ }
fm_rda_set_role(fm_cb, PCS_RDA_ACTIVE);
} else if (fm_mbx_evt->info.fm_tmr->type ==
FM_TMR_ACTIVATION_SUPERVISION) {
opensaf_reboot(0, NULL, "Activation timer supervision "
diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot
--- a/scripts/opensaf_reboot
+++ b/scripts/opensaf_reboot
@@ -26,13 +26,31 @@
# through proprietary mechanisms, i.e. not through PLM. Node_id is (the only
# entity) at the disposal of such a mechanism.
+if [ -f "$pkgsysconfdir/fmd.conf" ]; then
+ . "$pkgsysconfdir/fmd.conf"
+fi
+
NODE_ID_FILE=$pkglocalstatedir/node_id
+
node_id=$1
ee_name=$2
# Run commands through sudo when not superuser
test $(id -u) -ne 0 && icmd=$(which sudo 2> /dev/null)
+## Use stonith for remote fencing.
+## Reads the global $ee_name (name of the virtual machine to fence) and the
+## FMS_FENCE_CMD, FMS_DEVICE_TYPE, FMS_HYPERVISOR_URI and FMS_FENCE_ACTION
+## settings. Exits the script with status 1 if fencing cannot be performed.
+opensaf_reboot_with_remote_fencing()
+{
+	# Without a node name the hostlist would be "node:" with no machine to act on.
+	if [ -z "$ee_name" ]; then
+		logger -t "opensaf_reboot" "Not rebooting remote node using $FMS_FENCE_CMD, no node name given"
+		exit 1
+	fi
+
+	"$FMS_FENCE_CMD" -t "$FMS_DEVICE_TYPE" hostlist="node:$ee_name" hypervisor_uri="$FMS_HYPERVISOR_URI" -T "$FMS_FENCE_ACTION" node
+
+	retval=$?
+	if [ "$retval" -ne 0 ]; then
+		logger -t "opensaf_reboot" "Rebooting remote node $ee_name using $FMS_FENCE_CMD failed, rc: $retval"
+		exit 1
+	fi
+}
+
+
#if plm exists in the system,then the reboot is performed using the eename.
opensaf_reboot_with_plm()
{
@@ -86,17 +104,22 @@ if [ "$self_node_id" = "$node_id" ] || [
# Reboot (not shutdown) system WITH file system sync
$icmd /sbin/reboot -f
else
- if [ ":$ee_name" != ":" ]; then
- plm_node_presence_state=`immlist $ee_name |grep
saPlmEEPresenceState|awk '{print $3}'`
- plm_node_state=`immlist $ee_name |grep saPlmEEAdminState|awk
'{print $3}'`
- if [ "$plm_node_presence_state" != 3 ] ; then
- logger -t "opensaf_reboot" "Not rebooting remote node
$ee_name as it is not in INSTANTIATED state"
- elif [ $plm_node_state != 2 ]; then
- opensaf_reboot_with_plm
- else
- logger -t "opensaf_reboot" "Not rebooting remote node
$ee_name as it is already in locked state"
+ if [ "$FMS_USE_REMOTE_FENCING" = "1" ]; then
+ opensaf_reboot_with_remote_fencing
+ else
+ if [ ":$ee_name" != ":" ]; then
+
+ plm_node_presence_state=`immlist $ee_name |grep
saPlmEEPresenceState|awk '{print $3}'`
+ plm_node_state=`immlist $ee_name |grep
saPlmEEAdminState|awk '{print $3}'`
+ if [ "$plm_node_presence_state" != 3 ] ; then
+ logger -t "opensaf_reboot" "Not rebooting
remote node $ee_name as it is not in INSTANTIATED state"
+ elif [ $plm_node_state != 2 ]; then
+ opensaf_reboot_with_plm
+ else
+ logger -t "opensaf_reboot" "Not rebooting
remote node $ee_name as it is already in locked state"
+ fi
+ else
+ logger -t "opensaf_reboot" "Rebooting remote node in
the absence of PLM is outside the scope of OpenSAF"
fi
- else
- logger -t "opensaf_reboot" "Rebooting remote node in the
absence of PLM is outside the scope of OpenSAF"
- fi
+ fi
fi
------------------------------------------------------------------------------
Attend Shape: An AT&T Tech Expo July 15-16. Meet us at AT&T Park in San
Francisco, CA to explore cutting-edge tech and listen to tech luminaries
present their vision of the future. This family event has something for
everyone, including kids. Get more information and register today.
http://sdm.link/attshape
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel