Ack, not tested. One comment: I believe this works out of pure luck. Both FM and CLM act on node down or service down events, and by the time FM acts, CLM has already removed the node from the cluster membership. I am not sure that the events are guaranteed to be handled in this order when running /etc/init.d/opensafd stop, and in the opposite order in the case of node failures. I ack the code anyway, since I see this as a temporary implementation. When we implement the "enhanced cluster management" feature, we will have to revisit this anyway (and most likely rewrite both FM and CLM).
regards, Anders Widell On 11/04/2016 01:59 PM, Hans Nordeback wrote: > osaf/services/infrastructure/fm/fms/Makefile.am | 6 +- > osaf/services/infrastructure/fm/fms/fm_main.c | 114 > ++++++++++++++++++++--- > 2 files changed, 104 insertions(+), 16 deletions(-) > > > diff --git a/osaf/services/infrastructure/fm/fms/Makefile.am > b/osaf/services/infrastructure/fm/fms/Makefile.am > --- a/osaf/services/infrastructure/fm/fms/Makefile.am > +++ b/osaf/services/infrastructure/fm/fms/Makefile.am > @@ -33,7 +33,8 @@ noinst_HEADERS = \ > > osaffmd_CPPFLAGS= \ > $(AM_CPPFLAGS) \ > - -I$(top_srcdir)/osaf/services/infrastructure/fm/include > + -I$(top_srcdir)/osaf/services/infrastructure/fm/include \ > + -I$(top_srcdir)/osaf/libs/common/immsv/include > > osaffmd_CFLAGS = $(AM_CFLAGS) > > @@ -44,6 +45,9 @@ osaffmd_SOURCES = \ > fm_amf.c > > osaffmd_LDADD = \ > + $(top_builddir)/osaf/tools/safimm/src/libimmutil.la \ > + $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOi.la \ > + $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOm.la \ > $(top_builddir)/osaf/libs/core/libopensaf_core.la \ > $(top_builddir)/osaf/libs/saf/libSaAmf/libSaAmf.la \ > $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la \ > diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c > b/osaf/services/infrastructure/fm/fms/fm_main.c > --- a/osaf/services/infrastructure/fm/fms/fm_main.c > +++ b/osaf/services/infrastructure/fm/fms/fm_main.c > @@ -31,6 +31,7 @@ This file contains the main() routine fo > #include <nid_api.h> > #include "fm.h" > #include "osaf_time.h" > +#include "immutil.h" > > #define FM_CLM_API_TIMEOUT 10000000000LL > > @@ -62,6 +63,8 @@ static uint32_t fms_fms_exchange_node_in > static uint32_t fm_nid_notify(uint32_t); > static uint32_t fm_tmr_start(FM_TMR *, SaTimeT); > static SaAisErrorT get_peer_clm_node_name(NODE_ID); > +static const char* get_clm_node_name(const SaNameT* node_name); > +static bool is_node_clm_member(const SaNameT *clm_node_name); > static SaAisErrorT fm_clm_init(); 
> static void fm_mbx_msg_handler(FM_CB *, FM_EVT *); > static void fm_evt_proc_rda_callback(FM_CB*, FM_EVT*); > @@ -499,6 +502,30 @@ void fm_proc_svc_down(FM_CB *cb, FM_EVT > } > > > /**************************************************************************** > +* Name : get_node_name > +* > +* Description : Extract node name from DN clm node. > +* > +* Arguments : DN clm node name. > +* > +* Return Values : Extrated node name. > +* > +* Notes : None. > +*****************************************************************************/ > +static const char* get_clm_node_name(const SaNameT* node_name){ > + SaNameT tmp_node_name = *node_name; > + char *save_ptr; > + // Extract peer clm node name, e.g SC-2 from > "safNode=SC-2,safCluster=myClmCluster" > + // The peer clm node name will be passed to opensaf_reboot script to > support remote fencing. > + // The peer clm node name should correspond to the name of the virtual > machine for that node. > + strtok_r((char*) tmp_node_name.value, "=", &save_ptr); > + char *node = strtok_r(NULL, ",", &save_ptr); > + char *tmp = strndup(node, strlen(node)); > + LOG_NO("Peer clm node name: %s", tmp); > + return tmp; > +} > + > +/**************************************************************************** > * Name : fm_clm_init > * > * Description : Initialize CLM. > @@ -521,16 +548,10 @@ static SaAisErrorT get_peer_clm_node_nam > } > > if ((rc = saClmClusterNodeGet_4(fm_cb->clm_hdl, node_id, > FM_CLM_API_TIMEOUT, &cluster_node)) == SA_AIS_OK) { > - // Extract peer clm node name, e.g SC-2 from > "safNode=SC-2,safCluster=myClmCluster" > - // The peer clm node name will be passed to opensaf_reboot > script to support remote fencing. > - // The peer clm node name should correspond to the name of the > virtual machine for that node. 
> - char *node = NULL; > - strtok((char*) cluster_node.nodeName.value, "="); > - node = strtok(NULL, ","); > - strncpy((char*) fm_cb->peer_clm_node_name.value, node, > cluster_node.nodeName.length); > + fm_cb->peer_clm_node_name = cluster_node.nodeName; > LOG_NO("Peer clm node name: %s", > fm_cb->peer_clm_node_name.value); > } else { > - LOG_WA("saClmClusterNodeGet_4 returned %u", (unsigned) rc); > + LOG_WA("saClmClusterNodeGet_4 returned %d", rc); > } > > if ((rc = saClmFinalize(fm_cb->clm_hdl)) != SA_AIS_OK) { > @@ -551,6 +572,58 @@ static SaAisErrorT get_peer_clm_node_nam > * > * Notes : None. > > *****************************************************************************/ > +static bool is_node_clm_member(const SaNameT *clm_node_name) > +{ > + SaAisErrorT rc = SA_AIS_OK; > + SaUint32T clm_member = 0; > + SaNameT node_name = *clm_node_name; > + > + SaVersionT immVersion = { 'A', 2, 15 }; > + const SaImmAttrValuesT_2 **attributes; > + SaImmAccessorHandleT accessor_handle; > + SaImmHandleT om_handle; > + > + if ((rc = immutil_saImmOmInitialize(&om_handle, NULL, &immVersion)) != > SA_AIS_OK) { > + LOG_ER("saImmOmInitialize FAILED: %u", rc); > + goto done; > + } > + > + if ((rc = immutil_saImmOmAccessorInitialize(om_handle, > &accessor_handle)) != SA_AIS_OK) { > + LOG_ER("saImmOmAccessorInitialize FAILED: %u", rc); > + goto om_finalize; > + } > + > + if ((rc = immutil_saImmOmAccessorGet_2(accessor_handle, &node_name, > NULL, (SaImmAttrValuesT_2 ***) &attributes)) != SA_AIS_OK) { > + LOG_ER("saImmOmAccessorGet_2 FAILED: %s %u ", node_name.value, > rc); > + goto accessor_finalize; > + } > + > + if ((rc = immutil_getAttr("saClmNodeIsMember", attributes, 0, > &clm_member)) != SA_AIS_OK) { > + LOG_ER("immutil_getAttr FAILED: %u", rc); > + } > +accessor_finalize: > + if ((rc = immutil_saImmOmAccessorFinalize(accessor_handle)) != > SA_AIS_OK) { > + LOG_NO("immutil_saImmOmAccessorFinalize FAILED: %u", rc); > + } > +om_finalize: > + if ((rc = 
immutil_saImmOmFinalize(om_handle)) != SA_AIS_OK) { > + LOG_NO("immutil_saImmOmFinalize FAILED: %u", rc); > + } > +done: > + return (clm_member == 1) ? true : false; > +} > + > +/**************************************************************************** > +* Name : fm_clm_init > +* > +* Description : Initialize CLM. > +* > +* Arguments : None. > +* > +* Return Values : None. > +* > +* Notes : None. > +*****************************************************************************/ > static SaAisErrorT fm_clm_init() > { > SaAisErrorT rc = SA_AIS_OK; > @@ -622,8 +695,15 @@ static void fm_mbx_msg_handler(FM_CB *fm > * node_down event has been received. > */ > if (fm_cb->use_remote_fencing) { > - opensaf_reboot(fm_cb->peer_node_id, > (char *)fm_cb->peer_clm_node_name.value, > - "Received Node Down for > peer controller"); > + const char* clm_node_name = > get_clm_node_name(&fm_cb->peer_clm_node_name); > + if > (is_node_clm_member(&fm_cb->peer_clm_node_name)) { > + > opensaf_reboot(fm_cb->peer_node_id, clm_node_name, > + "Received Node > Down for peer controller"); > + } else { > + LOG_NO("Peer node %s is not a > member of the CLM cluster, fencing will not be performed", > + clm_node_name); > + } > + free((char*)clm_node_name); > } else { > opensaf_reboot(fm_cb->peer_node_id, > (char *)fm_cb->peer_node_name.value, > "Received Node Down for > peer controller"); > @@ -661,11 +741,15 @@ static void fm_mbx_msg_handler(FM_CB *fm > > LOG_NO("Reseting peer controller node id: %x", > fm_cb->peer_node_id); > if (fm_cb->use_remote_fencing) { > - LOG_NO("saClmClusterNodeGet succeeded node_id > 0x%X, clm peer node name %s", > - fm_mbx_evt->node_id, > fm_cb->peer_clm_node_name.value); > - > - opensaf_reboot(fm_cb->peer_node_id, (char > *)fm_cb->peer_clm_node_name.value, > - "Received Node Down for peer > controller"); > + const char* clm_node_name = > get_clm_node_name(&fm_cb->peer_clm_node_name); > + if > (is_node_clm_member(&fm_cb->peer_clm_node_name)) { > + 
opensaf_reboot(fm_cb->peer_node_id, > clm_node_name, > + "Received Node Down for > peer controller"); > + } else { > + LOG_NO("Peer node %s is not a member of > the CLM cluster, fencing will not be performed", > + clm_node_name); > + } > + free((char*)clm_node_name); > } else { > opensaf_reboot(fm_cb->peer_node_id, (char > *)fm_cb->peer_node_name.value, > "Received Node Down for Active > peer"); ------------------------------------------------------------------------------ Developer Access Program for Intel Xeon Phi Processors Access to Intel Xeon Phi processor-based developer platforms. With one year of Intel Parallel Studio XE. Training and support from Colfax. Order your platform today. http://sdm.link/xeonphi _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel