Hi Anders, as discussed in ticket #2094, where it is suggested to:
"document that admin needs to perform clm admin lock of standby controller before repairing" this could be complemented with: "document that admin needs to perform clm admin lock of standby controller before repairing or executing 'opensafd stop'" /Regards HansN On 11/11/2016 08:43 PM, Anders Widell wrote: > Ack, not tested. > > A comment is that I believe this works out of pure luck: both FM and > CLM act on node down or service down events, and by the time FM acts, > CLM has already removed the node from cluster membership. I am not > sure if it is guaranteed to work like this in the case of running > /etc/init.d/opensafd stop, and in the opposite way in case of node > failures. I ack the code anyway since I see this as a temporary > implementation. When we implement the "enhanced cluster management" > feature, we will anyhow have to revisit this (and most likely re-write > both FM and CLM..). > > regards, > > Anders Widell > > > On 11/04/2016 01:59 PM, Hans Nordeback wrote: >> osaf/services/infrastructure/fm/fms/Makefile.am | 6 +- >> osaf/services/infrastructure/fm/fms/fm_main.c | 114 >> ++++++++++++++++++++--- >> 2 files changed, 104 insertions(+), 16 deletions(-) >> >> >> diff --git a/osaf/services/infrastructure/fm/fms/Makefile.am >> b/osaf/services/infrastructure/fm/fms/Makefile.am >> --- a/osaf/services/infrastructure/fm/fms/Makefile.am >> +++ b/osaf/services/infrastructure/fm/fms/Makefile.am >> @@ -33,7 +33,8 @@ noinst_HEADERS = \ >> osaffmd_CPPFLAGS= \ >> $(AM_CPPFLAGS) \ >> - -I$(top_srcdir)/osaf/services/infrastructure/fm/include >> + -I$(top_srcdir)/osaf/services/infrastructure/fm/include \ >> + -I$(top_srcdir)/osaf/libs/common/immsv/include >> osaffmd_CFLAGS = $(AM_CFLAGS) >> @@ -44,6 +45,9 @@ osaffmd_SOURCES = \ >> fm_amf.c >> osaffmd_LDADD = \ >> + $(top_builddir)/osaf/tools/safimm/src/libimmutil.la \ >> + $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOi.la \ >> + $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOm.la \ >> 
$(top_builddir)/osaf/libs/core/libopensaf_core.la \ >> $(top_builddir)/osaf/libs/saf/libSaAmf/libSaAmf.la \ >> $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la \ >> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c >> b/osaf/services/infrastructure/fm/fms/fm_main.c >> --- a/osaf/services/infrastructure/fm/fms/fm_main.c >> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c >> @@ -31,6 +31,7 @@ This file contains the main() routine fo >> #include <nid_api.h> >> #include "fm.h" >> #include "osaf_time.h" >> +#include "immutil.h" >> #define FM_CLM_API_TIMEOUT 10000000000LL >> @@ -62,6 +63,8 @@ static uint32_t fms_fms_exchange_node_in >> static uint32_t fm_nid_notify(uint32_t); >> static uint32_t fm_tmr_start(FM_TMR *, SaTimeT); >> static SaAisErrorT get_peer_clm_node_name(NODE_ID); >> +static const char* get_clm_node_name(const SaNameT* node_name); >> +static bool is_node_clm_member(const SaNameT *clm_node_name); >> static SaAisErrorT fm_clm_init(); >> static void fm_mbx_msg_handler(FM_CB *, FM_EVT *); >> static void fm_evt_proc_rda_callback(FM_CB*, FM_EVT*); >> @@ -499,6 +502,30 @@ void fm_proc_svc_down(FM_CB *cb, FM_EVT >> } >> /**************************************************************************** >> +* Name : get_node_name >> +* >> +* Description : Extract node name from DN clm node. >> +* >> +* Arguments : DN clm node name. >> +* >> +* Return Values : Extrated node name. >> +* >> +* Notes : None. >> +*****************************************************************************/ >> >> >> +static const char* get_clm_node_name(const SaNameT* node_name){ >> + SaNameT tmp_node_name = *node_name; >> + char *save_ptr; >> + // Extract peer clm node name, e.g SC-2 from >> "safNode=SC-2,safCluster=myClmCluster" >> + // The peer clm node name will be passed to opensaf_reboot >> script to support remote fencing. >> + // The peer clm node name should correspond to the name of the >> virtual machine for that node. 
>> + strtok_r((char*) tmp_node_name.value, "=", &save_ptr); >> + char *node = strtok_r(NULL, ",", &save_ptr); >> + char *tmp = strndup(node, strlen(node)); >> + LOG_NO("Peer clm node name: %s", tmp); >> + return tmp; >> +} >> + >> +/**************************************************************************** >> >> >> * Name : fm_clm_init >> * >> * Description : Initialize CLM. >> @@ -521,16 +548,10 @@ static SaAisErrorT get_peer_clm_node_nam >> } >> if ((rc = saClmClusterNodeGet_4(fm_cb->clm_hdl, node_id, >> FM_CLM_API_TIMEOUT, &cluster_node)) == SA_AIS_OK) { >> - // Extract peer clm node name, e.g SC-2 from >> "safNode=SC-2,safCluster=myClmCluster" >> - // The peer clm node name will be passed to opensaf_reboot >> script to support remote fencing. >> - // The peer clm node name should correspond to the name of >> the virtual machine for that node. >> - char *node = NULL; >> - strtok((char*) cluster_node.nodeName.value, "="); >> - node = strtok(NULL, ","); >> - strncpy((char*) fm_cb->peer_clm_node_name.value, node, >> cluster_node.nodeName.length); >> + fm_cb->peer_clm_node_name = cluster_node.nodeName; >> LOG_NO("Peer clm node name: %s", >> fm_cb->peer_clm_node_name.value); >> } else { >> - LOG_WA("saClmClusterNodeGet_4 returned %u", (unsigned) rc); >> + LOG_WA("saClmClusterNodeGet_4 returned %d", rc); >> } >> if ((rc = saClmFinalize(fm_cb->clm_hdl)) != SA_AIS_OK) { >> @@ -551,6 +572,58 @@ static SaAisErrorT get_peer_clm_node_nam >> * >> * Notes : None. 
>> *****************************************************************************/ >> +static bool is_node_clm_member(const SaNameT *clm_node_name) >> +{ >> + SaAisErrorT rc = SA_AIS_OK; >> + SaUint32T clm_member = 0; >> + SaNameT node_name = *clm_node_name; >> + >> + SaVersionT immVersion = { 'A', 2, 15 }; >> + const SaImmAttrValuesT_2 **attributes; >> + SaImmAccessorHandleT accessor_handle; >> + SaImmHandleT om_handle; >> + >> + if ((rc = immutil_saImmOmInitialize(&om_handle, NULL, >> &immVersion)) != SA_AIS_OK) { >> + LOG_ER("saImmOmInitialize FAILED: %u", rc); >> + goto done; >> + } >> + >> + if ((rc = immutil_saImmOmAccessorInitialize(om_handle, >> &accessor_handle)) != SA_AIS_OK) { >> + LOG_ER("saImmOmAccessorInitialize FAILED: %u", rc); >> + goto om_finalize; >> + } >> + >> + if ((rc = immutil_saImmOmAccessorGet_2(accessor_handle, >> &node_name, NULL, (SaImmAttrValuesT_2 ***) &attributes)) != SA_AIS_OK) { >> + LOG_ER("saImmOmAccessorGet_2 FAILED: %s %u ", >> node_name.value, rc); >> + goto accessor_finalize; >> + } >> + >> + if ((rc = immutil_getAttr("saClmNodeIsMember", attributes, 0, >> &clm_member)) != SA_AIS_OK) { >> + LOG_ER("immutil_getAttr FAILED: %u", rc); >> + } >> +accessor_finalize: >> + if ((rc = immutil_saImmOmAccessorFinalize(accessor_handle)) != >> SA_AIS_OK) { >> + LOG_NO("immutil_saImmOmAccessorFinalize FAILED: %u", rc); >> + } >> +om_finalize: >> + if ((rc = immutil_saImmOmFinalize(om_handle)) != SA_AIS_OK) { >> + LOG_NO("immutil_saImmOmFinalize FAILED: %u", rc); >> + } >> +done: >> + return (clm_member == 1) ? true : false; >> +} >> + >> +/**************************************************************************** >> >> >> +* Name : fm_clm_init >> +* >> +* Description : Initialize CLM. >> +* >> +* Arguments : None. >> +* >> +* Return Values : None. >> +* >> +* Notes : None. 
>> +*****************************************************************************/ >> >> >> static SaAisErrorT fm_clm_init() >> { >> SaAisErrorT rc = SA_AIS_OK; >> @@ -622,8 +695,15 @@ static void fm_mbx_msg_handler(FM_CB *fm >> * node_down event has been received. >> */ >> if (fm_cb->use_remote_fencing) { >> - opensaf_reboot(fm_cb->peer_node_id, (char >> *)fm_cb->peer_clm_node_name.value, >> - "Received Node Down for peer controller"); >> + const char* clm_node_name = >> get_clm_node_name(&fm_cb->peer_clm_node_name); >> + if >> (is_node_clm_member(&fm_cb->peer_clm_node_name)) { >> + opensaf_reboot(fm_cb->peer_node_id, >> clm_node_name, >> + "Received Node Down for peer >> controller"); >> + } else { >> + LOG_NO("Peer node %s is not a member of the >> CLM cluster, fencing will not be performed", >> + clm_node_name); >> + } >> + free((char*)clm_node_name); >> } else { >> opensaf_reboot(fm_cb->peer_node_id, (char >> *)fm_cb->peer_node_name.value, >> "Received Node Down for peer controller"); >> @@ -661,11 +741,15 @@ static void fm_mbx_msg_handler(FM_CB *fm >> LOG_NO("Reseting peer controller node id: %x", >> fm_cb->peer_node_id); >> if (fm_cb->use_remote_fencing) { >> - LOG_NO("saClmClusterNodeGet succeeded node_id 0x%X, >> clm peer node name %s", >> - fm_mbx_evt->node_id, >> fm_cb->peer_clm_node_name.value); >> - >> - opensaf_reboot(fm_cb->peer_node_id, (char >> *)fm_cb->peer_clm_node_name.value, >> - "Received Node Down for peer controller"); >> + const char* clm_node_name = >> get_clm_node_name(&fm_cb->peer_clm_node_name); >> + if (is_node_clm_member(&fm_cb->peer_clm_node_name)) { >> + opensaf_reboot(fm_cb->peer_node_id, clm_node_name, >> + "Received Node Down for peer controller"); >> + } else { >> + LOG_NO("Peer node %s is not a member of the CLM >> cluster, fencing will not be performed", >> + clm_node_name); >> + } >> + free((char*)clm_node_name); >> } else { >> opensaf_reboot(fm_cb->peer_node_id, (char >> *)fm_cb->peer_node_name.value, >> "Received Node Down 
for Active peer"); > ------------------------------------------------------------------------------ Developer Access Program for Intel Xeon Phi Processors Access to Intel Xeon Phi processor-based developer platforms. With one year of Intel Parallel Studio XE. Training and support from Colfax. Order your platform today. http://sdm.link/xeonphi _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel