Ack, not tested.

One comment: I believe this works out of pure luck. Both FM and CLM
act on node down or service down events, and the check relies on CLM
having already removed the node from cluster membership by the time FM
acts. I am not sure that this ordering is guaranteed when running
/etc/init.d/opensafd stop, nor that the opposite ordering (FM acting
before CLM has removed the node) is guaranteed in the case of node
failures. I ack the code anyway, since I see this as a temporary
implementation. When we implement the "enhanced cluster management"
feature, we will have to revisit this in any case (and most likely
rewrite both FM and CLM).

regards,

Anders Widell


On 11/04/2016 01:59 PM, Hans Nordeback wrote:
>   osaf/services/infrastructure/fm/fms/Makefile.am |    6 +-
>   osaf/services/infrastructure/fm/fms/fm_main.c   |  114 
> ++++++++++++++++++++---
>   2 files changed, 104 insertions(+), 16 deletions(-)
>
>
> diff --git a/osaf/services/infrastructure/fm/fms/Makefile.am 
> b/osaf/services/infrastructure/fm/fms/Makefile.am
> --- a/osaf/services/infrastructure/fm/fms/Makefile.am
> +++ b/osaf/services/infrastructure/fm/fms/Makefile.am
> @@ -33,7 +33,8 @@ noinst_HEADERS = \
>   
>   osaffmd_CPPFLAGS= \
>       $(AM_CPPFLAGS) \
> -     -I$(top_srcdir)/osaf/services/infrastructure/fm/include
> +     -I$(top_srcdir)/osaf/services/infrastructure/fm/include \
> +     -I$(top_srcdir)/osaf/libs/common/immsv/include
>   
>   osaffmd_CFLAGS = $(AM_CFLAGS)
>   
> @@ -44,6 +45,9 @@ osaffmd_SOURCES = \
>       fm_amf.c
>   
>   osaffmd_LDADD = \
> +     $(top_builddir)/osaf/tools/safimm/src/libimmutil.la \
> +     $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOi.la \
> +     $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOm.la \
>       $(top_builddir)/osaf/libs/core/libopensaf_core.la \
>       $(top_builddir)/osaf/libs/saf/libSaAmf/libSaAmf.la \
>       $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la \
> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c 
> b/osaf/services/infrastructure/fm/fms/fm_main.c
> --- a/osaf/services/infrastructure/fm/fms/fm_main.c
> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c
> @@ -31,6 +31,7 @@ This file contains the main() routine fo
>   #include <nid_api.h>
>   #include "fm.h"
>   #include "osaf_time.h"
> +#include "immutil.h"
>   
>   #define FM_CLM_API_TIMEOUT 10000000000LL
>   
> @@ -62,6 +63,8 @@ static uint32_t fms_fms_exchange_node_in
>   static uint32_t fm_nid_notify(uint32_t);
>   static uint32_t fm_tmr_start(FM_TMR *, SaTimeT);
>   static SaAisErrorT get_peer_clm_node_name(NODE_ID);
> +static const char* get_clm_node_name(const SaNameT* node_name);
> +static bool is_node_clm_member(const SaNameT *clm_node_name);
>   static SaAisErrorT fm_clm_init();
>   static void fm_mbx_msg_handler(FM_CB *, FM_EVT *);
>   static void fm_evt_proc_rda_callback(FM_CB*, FM_EVT*);
> @@ -499,6 +502,30 @@ void fm_proc_svc_down(FM_CB *cb, FM_EVT
>   }
>   
>   
> /****************************************************************************
> +* Name          : get_node_name
> +*
> +* Description   : Extract node name from DN clm node.
> +*
> +* Arguments     : DN clm node name.
> +*
> +* Return Values : Extrated node name.
> +*
> +* Notes         : None.
> +*****************************************************************************/
> +static const char* get_clm_node_name(const SaNameT* node_name){
> +     SaNameT tmp_node_name = *node_name;
> +     char *save_ptr;
> +     // Extract peer clm node name, e.g SC-2 from 
> "safNode=SC-2,safCluster=myClmCluster"
> +     // The peer clm node name will be passed to opensaf_reboot script to 
> support remote fencing.
> +     // The peer clm node name should correspond to the name of the virtual 
> machine for that node.
> +     strtok_r((char*) tmp_node_name.value, "=", &save_ptr);
> +     char *node = strtok_r(NULL, ",", &save_ptr);
> +     char *tmp = strndup(node, strlen(node));
> +     LOG_NO("Peer clm node name: %s", tmp);
> +     return tmp;
> +}
> +
> +/****************************************************************************
>   * Name          : fm_clm_init
>   *
>   * Description   : Initialize CLM.
> @@ -521,16 +548,10 @@ static SaAisErrorT get_peer_clm_node_nam
>       }
>   
>       if ((rc = saClmClusterNodeGet_4(fm_cb->clm_hdl, node_id, 
> FM_CLM_API_TIMEOUT, &cluster_node)) == SA_AIS_OK) {
> -             // Extract peer clm node name, e.g SC-2 from 
> "safNode=SC-2,safCluster=myClmCluster"
> -             // The peer clm node name will be passed to opensaf_reboot 
> script to support remote fencing.
> -             // The peer clm node name should correspond to the name of the 
> virtual machine for that node.
> -             char *node = NULL;
> -             strtok((char*) cluster_node.nodeName.value, "=");
> -             node = strtok(NULL, ",");
> -             strncpy((char*) fm_cb->peer_clm_node_name.value, node, 
> cluster_node.nodeName.length);
> +             fm_cb->peer_clm_node_name = cluster_node.nodeName;
>               LOG_NO("Peer clm node name: %s", 
> fm_cb->peer_clm_node_name.value);
>       } else {
> -             LOG_WA("saClmClusterNodeGet_4 returned %u", (unsigned) rc);
> +             LOG_WA("saClmClusterNodeGet_4 returned %d", rc);
>       }
>   
>       if ((rc = saClmFinalize(fm_cb->clm_hdl)) != SA_AIS_OK) {
> @@ -551,6 +572,58 @@ static SaAisErrorT get_peer_clm_node_nam
>   *
>   * Notes         : None.
>   
> *****************************************************************************/
> +static bool is_node_clm_member(const SaNameT *clm_node_name)
> +{
> +     SaAisErrorT rc = SA_AIS_OK;
> +     SaUint32T clm_member = 0;
> +     SaNameT node_name = *clm_node_name;
> +
> +     SaVersionT immVersion = { 'A', 2, 15 };
> +     const SaImmAttrValuesT_2 **attributes;
> +     SaImmAccessorHandleT accessor_handle;
> +     SaImmHandleT om_handle;
> +
> +     if ((rc = immutil_saImmOmInitialize(&om_handle, NULL, &immVersion)) != 
> SA_AIS_OK) {
> +             LOG_ER("saImmOmInitialize FAILED: %u", rc);
> +             goto done;
> +     }
> +
> +     if ((rc = immutil_saImmOmAccessorInitialize(om_handle, 
> &accessor_handle)) != SA_AIS_OK) {
> +             LOG_ER("saImmOmAccessorInitialize FAILED: %u", rc);
> +             goto om_finalize;
> +     }
> +
> +     if ((rc = immutil_saImmOmAccessorGet_2(accessor_handle, &node_name, 
> NULL, (SaImmAttrValuesT_2 ***) &attributes)) != SA_AIS_OK) {
> +             LOG_ER("saImmOmAccessorGet_2 FAILED: %s %u ", node_name.value, 
> rc);
> +             goto accessor_finalize;
> +     }
> +
> +     if ((rc = immutil_getAttr("saClmNodeIsMember", attributes, 0, 
> &clm_member)) != SA_AIS_OK) {
> +             LOG_ER("immutil_getAttr FAILED: %u", rc);
> +     }
> +accessor_finalize:
> +     if ((rc = immutil_saImmOmAccessorFinalize(accessor_handle)) != 
> SA_AIS_OK) {
> +             LOG_NO("immutil_saImmOmAccessorFinalize FAILED: %u", rc);
> +     }
> +om_finalize:
> +     if ((rc = immutil_saImmOmFinalize(om_handle)) != SA_AIS_OK) {
> +             LOG_NO("immutil_saImmOmFinalize FAILED: %u", rc);
> +     }
> +done:
> +     return (clm_member == 1) ? true : false;
> +}
> +
> +/****************************************************************************
> +* Name          : fm_clm_init
> +*
> +* Description   : Initialize CLM.
> +*
> +* Arguments     : None.
> +*
> +* Return Values : None.
> +*
> +* Notes         : None.
> +*****************************************************************************/
>   static SaAisErrorT fm_clm_init()
>   {
>       SaAisErrorT rc = SA_AIS_OK;
> @@ -622,8 +695,15 @@ static void fm_mbx_msg_handler(FM_CB *fm
>                                        * node_down event has been received.
>                                        */
>                               if (fm_cb->use_remote_fencing) {
> -                                     opensaf_reboot(fm_cb->peer_node_id, 
> (char *)fm_cb->peer_clm_node_name.value,
> -                                                     "Received Node Down for 
> peer controller");
> +                                     const char* clm_node_name = 
> get_clm_node_name(&fm_cb->peer_clm_node_name);
> +                                     if 
> (is_node_clm_member(&fm_cb->peer_clm_node_name)) {
> +                                             
> opensaf_reboot(fm_cb->peer_node_id, clm_node_name,
> +                                                             "Received Node 
> Down for peer controller");
> +                                     } else {
> +                                             LOG_NO("Peer node %s is not a 
> member of the CLM cluster, fencing will not be performed",
> +                                                             clm_node_name);
> +                                     }
> +                                     free((char*)clm_node_name);
>                               } else {
>                                       opensaf_reboot(fm_cb->peer_node_id, 
> (char *)fm_cb->peer_node_name.value,
>                                                       "Received Node Down for 
> peer controller");
> @@ -661,11 +741,15 @@ static void fm_mbx_msg_handler(FM_CB *fm
>   
>                       LOG_NO("Reseting peer controller node id: %x", 
> fm_cb->peer_node_id);
>                       if (fm_cb->use_remote_fencing) {
> -                             LOG_NO("saClmClusterNodeGet succeeded node_id 
> 0x%X, clm peer node name %s",
> -                                     fm_mbx_evt->node_id, 
> fm_cb->peer_clm_node_name.value);
> -
> -                             opensaf_reboot(fm_cb->peer_node_id, (char 
> *)fm_cb->peer_clm_node_name.value,
> -                                             "Received Node Down for peer 
> controller");
> +                             const char* clm_node_name = 
> get_clm_node_name(&fm_cb->peer_clm_node_name);
> +                             if 
> (is_node_clm_member(&fm_cb->peer_clm_node_name)) {
> +                                     opensaf_reboot(fm_cb->peer_node_id, 
> clm_node_name,
> +                                                     "Received Node Down for 
> peer controller");
> +                             } else {
> +                                     LOG_NO("Peer node %s is not a member of 
> the CLM cluster, fencing will not be performed",
> +                                                     clm_node_name);
> +                             }
> +                             free((char*)clm_node_name);
>                       } else {
>                               opensaf_reboot(fm_cb->peer_node_id, (char 
> *)fm_cb->peer_node_name.value,
>                                              "Received Node Down for Active 
> peer");


------------------------------------------------------------------------------
Developer Access Program for Intel Xeon Phi Processors
Access to Intel Xeon Phi processor-based developer platforms.
With one year of Intel Parallel Studio XE.
Training and support from Colfax.
Order your platform today. http://sdm.link/xeonphi
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to