Hi Hans,

Introducing a pre-condition that the CLM node should be administratively locked
before stopping the middleware is, firstly, not a backward-compatible change.
We also discussed this in the TLC last week and felt that in this release it is
relevant to maintain backward compatibility, and that 'the fencing mechanism
should not kick in/reboot upon stopping the middleware'.

This means we should figure out other means of differentiating a hung node 
versus a stopped node.
One idea that was discussed: perhaps some handshaking between the two nodes
when stopping the middleware on the other node, so that a
reboot can be attempted after some timeout.

Thanks,
Mathi.

> -----Original Message-----
> From: Hans Nordebäck [mailto:hans.nordeb...@ericsson.com]
> Sent: Monday, November 21, 2016 7:32 PM
> To: Ramesh Babu Betham; Mathivanan Naickan Palanivelu
> Cc: opensaf-devel@lists.sourceforge.net; Anders Widell
> Subject: RE: [devel] [PATCH 1 of 1] fm: Add support for differentiating a hung
> node versus a stopped node [#2160]
> 
> Hi Ramesh and Mathi,
> Have you had time to review this patch?
> /Regards HansN
> 
> -----Original Message-----
> From: Hans Nordeback [mailto:hans.nordeb...@ericsson.com]
> Sent: den 4 november 2016 14:00
> To: ramesh.bet...@oracle.com; mathi.naic...@oracle.com; Anders Widell
> <anders.wid...@ericsson.com>
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: [devel] [PATCH 1 of 1] fm: Add support for differentiating a hung
> node versus a stopped node [#2160]
> 
>  osaf/services/infrastructure/fm/fms/Makefile.am |    6 +-
>  osaf/services/infrastructure/fm/fms/fm_main.c   |  114
> ++++++++++++++++++++---
>  2 files changed, 104 insertions(+), 16 deletions(-)
> 
> 
> diff --git a/osaf/services/infrastructure/fm/fms/Makefile.am
> b/osaf/services/infrastructure/fm/fms/Makefile.am
> --- a/osaf/services/infrastructure/fm/fms/Makefile.am
> +++ b/osaf/services/infrastructure/fm/fms/Makefile.am
> @@ -33,7 +33,8 @@ noinst_HEADERS = \
> 
>  osaffmd_CPPFLAGS= \
>       $(AM_CPPFLAGS) \
> -     -I$(top_srcdir)/osaf/services/infrastructure/fm/include
> +     -I$(top_srcdir)/osaf/services/infrastructure/fm/include \
> +     -I$(top_srcdir)/osaf/libs/common/immsv/include
> 
>  osaffmd_CFLAGS = $(AM_CFLAGS)
> 
> @@ -44,6 +45,9 @@ osaffmd_SOURCES = \
>       fm_amf.c
> 
>  osaffmd_LDADD = \
> +     $(top_builddir)/osaf/tools/safimm/src/libimmutil.la \
> +     $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOi.la \
> +     $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOm.la \
>       $(top_builddir)/osaf/libs/core/libopensaf_core.la \
>       $(top_builddir)/osaf/libs/saf/libSaAmf/libSaAmf.la \
>       $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la \ diff --
> git a/osaf/services/infrastructure/fm/fms/fm_main.c
> b/osaf/services/infrastructure/fm/fms/fm_main.c
> --- a/osaf/services/infrastructure/fm/fms/fm_main.c
> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c
> @@ -31,6 +31,7 @@ This file contains the main() routine fo  #include
> <nid_api.h>  #include "fm.h"
>  #include "osaf_time.h"
> +#include "immutil.h"
> 
>  #define FM_CLM_API_TIMEOUT 10000000000LL
> 
> @@ -62,6 +63,8 @@ static uint32_t fms_fms_exchange_node_in  static
> uint32_t fm_nid_notify(uint32_t);  static uint32_t fm_tmr_start(FM_TMR *,
> SaTimeT);  static SaAisErrorT get_peer_clm_node_name(NODE_ID);
> +static const char* get_clm_node_name(const SaNameT* node_name);
> static
> +bool is_node_clm_member(const SaNameT *clm_node_name);
>  static SaAisErrorT fm_clm_init();
>  static void fm_mbx_msg_handler(FM_CB *, FM_EVT *);  static void
> fm_evt_proc_rda_callback(FM_CB*, FM_EVT*); @@ -499,6 +502,30 @@
> void fm_proc_svc_down(FM_CB *cb, FM_EVT  }
> 
> 
> /**********************************************************
> ******************
> +* Name          : get_node_name
> +*
> +* Description   : Extract node name from DN clm node.
> +*
> +* Arguments     : DN clm node name.
> +*
> +* Return Values : Extrated node name.
> +*
> +* Notes         : None.
> +*********************************************************
> **************
> +******/ static const char* get_clm_node_name(const SaNameT*
> node_name){
> +     SaNameT tmp_node_name = *node_name;
> +     char *save_ptr;
> +     // Extract peer clm node name, e.g SC-2 from "safNode=SC-
> 2,safCluster=myClmCluster"
> +     // The peer clm node name will be passed to opensaf_reboot script
> to support remote fencing.
> +     // The peer clm node name should correspond to the name of the
> virtual machine for that node.
> +     strtok_r((char*) tmp_node_name.value, "=", &save_ptr);
> +     char *node = strtok_r(NULL, ",", &save_ptr);
> +     char *tmp = strndup(node, strlen(node));
> +     LOG_NO("Peer clm node name: %s", tmp);
> +     return tmp;
> +}
> +
> +/*********************************************************
> *************
> +******
>  * Name          : fm_clm_init
>  *
>  * Description   : Initialize CLM.
> @@ -521,16 +548,10 @@ static SaAisErrorT get_peer_clm_node_nam
>       }
> 
>       if ((rc = saClmClusterNodeGet_4(fm_cb->clm_hdl, node_id,
> FM_CLM_API_TIMEOUT, &cluster_node)) == SA_AIS_OK) {
> -             // Extract peer clm node name, e.g SC-2 from "safNode=SC-
> 2,safCluster=myClmCluster"
> -             // The peer clm node name will be passed to opensaf_reboot
> script to support remote fencing.
> -             // The peer clm node name should correspond to the name
> of the virtual machine for that node.
> -             char *node = NULL;
> -             strtok((char*) cluster_node.nodeName.value, "=");
> -             node = strtok(NULL, ",");
> -             strncpy((char*) fm_cb->peer_clm_node_name.value, node,
> cluster_node.nodeName.length);
> +             fm_cb->peer_clm_node_name = cluster_node.nodeName;
>               LOG_NO("Peer clm node name: %s", fm_cb-
> >peer_clm_node_name.value);
>       } else {
> -             LOG_WA("saClmClusterNodeGet_4 returned %u",
> (unsigned) rc);
> +             LOG_WA("saClmClusterNodeGet_4 returned %d", rc);
>       }
> 
>       if ((rc = saClmFinalize(fm_cb->clm_hdl)) != SA_AIS_OK) { @@ -551,6
> +572,58 @@ static SaAisErrorT get_peer_clm_node_nam
>  *
>  * Notes         : None.
> 
> **********************************************************
> *******************/
> +static bool is_node_clm_member(const SaNameT *clm_node_name) {
> +     SaAisErrorT rc = SA_AIS_OK;
> +     SaUint32T clm_member = 0;
> +     SaNameT node_name = *clm_node_name;
> +
> +     SaVersionT immVersion = { 'A', 2, 15 };
> +     const SaImmAttrValuesT_2 **attributes;
> +     SaImmAccessorHandleT accessor_handle;
> +     SaImmHandleT om_handle;
> +
> +     if ((rc = immutil_saImmOmInitialize(&om_handle, NULL,
> &immVersion)) != SA_AIS_OK) {
> +             LOG_ER("saImmOmInitialize FAILED: %u", rc);
> +             goto done;
> +     }
> +
> +     if ((rc = immutil_saImmOmAccessorInitialize(om_handle,
> &accessor_handle)) != SA_AIS_OK) {
> +             LOG_ER("saImmOmAccessorInitialize FAILED: %u", rc);
> +             goto om_finalize;
> +     }
> +
> +     if ((rc = immutil_saImmOmAccessorGet_2(accessor_handle,
> &node_name, NULL, (SaImmAttrValuesT_2 ***) &attributes)) !=
> SA_AIS_OK) {
> +             LOG_ER("saImmOmAccessorGet_2 FAILED: %s %u ",
> node_name.value, rc);
> +             goto accessor_finalize;
> +     }
> +
> +     if ((rc = immutil_getAttr("saClmNodeIsMember", attributes, 0,
> &clm_member)) != SA_AIS_OK) {
> +             LOG_ER("immutil_getAttr FAILED: %u", rc);
> +     }
> +accessor_finalize:
> +     if ((rc = immutil_saImmOmAccessorFinalize(accessor_handle)) !=
> SA_AIS_OK) {
> +             LOG_NO("immutil_saImmOmAccessorFinalize FAILED: %u",
> rc);
> +     }
> +om_finalize:
> +     if ((rc = immutil_saImmOmFinalize(om_handle)) != SA_AIS_OK) {
> +             LOG_NO("immutil_saImmOmFinalize FAILED: %u", rc);
> +     }
> +done:
> +     return (clm_member == 1) ? true : false; }
> +
> +/*********************************************************
> *******************
> +* Name          : fm_clm_init
> +*
> +* Description   : Initialize CLM.
> +*
> +* Arguments     : None.
> +*
> +* Return Values : None.
> +*
> +* Notes         : None.
> +*********************************************************
> **************
> +******/
>  static SaAisErrorT fm_clm_init()
>  {
>       SaAisErrorT rc = SA_AIS_OK;
> @@ -622,8 +695,15 @@ static void fm_mbx_msg_handler(FM_CB *fm
>                                        * node_down event has been
> received.
>                                        */
>                               if (fm_cb->use_remote_fencing) {
> -                                     opensaf_reboot(fm_cb-
> >peer_node_id, (char *)fm_cb->peer_clm_node_name.value,
> -                                                     "Received Node
> Down for peer controller");
> +                                     const char* clm_node_name =
> get_clm_node_name(&fm_cb->peer_clm_node_name);
> +                                     if (is_node_clm_member(&fm_cb-
> >peer_clm_node_name)) {
> +                                             opensaf_reboot(fm_cb-
> >peer_node_id, clm_node_name,
> +                                                             "Received
> Node Down for peer controller");
> +                                     } else {
> +                                             LOG_NO("Peer node %s is
> not a member of the CLM cluster, fencing will not be performed",
> +
>       clm_node_name);
> +                                     }
> +                                     free((char*)clm_node_name);
>                               } else {
>                                       opensaf_reboot(fm_cb-
> >peer_node_id, (char *)fm_cb->peer_node_name.value,
>                                                       "Received Node
> Down for peer controller"); @@ -661,11 +741,15 @@ static void
> fm_mbx_msg_handler(FM_CB *fm
> 
>                       LOG_NO("Reseting peer controller node id: %x",
> fm_cb->peer_node_id);
>                       if (fm_cb->use_remote_fencing) {
> -                             LOG_NO("saClmClusterNodeGet succeeded
> node_id 0x%X, clm peer node name %s",
> -                                     fm_mbx_evt->node_id, fm_cb-
> >peer_clm_node_name.value);
> -
> -                             opensaf_reboot(fm_cb->peer_node_id,
> (char *)fm_cb->peer_clm_node_name.value,
> -                                             "Received Node Down for
> peer controller");
> +                             const char* clm_node_name =
> get_clm_node_name(&fm_cb->peer_clm_node_name);
> +                             if (is_node_clm_member(&fm_cb-
> >peer_clm_node_name)) {
> +                                     opensaf_reboot(fm_cb-
> >peer_node_id, clm_node_name,
> +                                                     "Received Node
> Down for peer controller");
> +                             } else {
> +                                     LOG_NO("Peer node %s is not a
> member of the CLM cluster, fencing will not be performed",
> +                                                     clm_node_name);
> +                             }
> +                             free((char*)clm_node_name);
>                       } else {
>                               opensaf_reboot(fm_cb->peer_node_id,
> (char *)fm_cb->peer_node_name.value,
>                                              "Received Node Down for Active
> peer");
> 
> ------------------------------------------------------------------------------
> Developer Access Program for Intel Xeon Phi Processors Access to Intel Xeon
> Phi processor-based developer platforms.
> With one year of Intel Parallel Studio XE.
> Training and support from Colfax.
> Order your platform today. http://sdm.link/xeonphi
> _______________________________________________
> Opensaf-devel mailing list
> Opensaf-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/opensaf-devel

------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to