If we are going to require a CLM node lock before stopping OpenSAF (as 
per my original proposal in the ticket), then maybe this patch should 
look at the node admin state rather than the membership status? This 
would remove the race between CLM and FM.

regards,

Anders Widell


On 11/14/2016 09:09 AM, Hans Nordeback wrote:
> Hi Anders,
>
> as discussed in ticket #2094 where it is suggested to:
>
> "document that admin needs to perform clm admin lock of standby 
> controller before repairing"
>
> this could be complemented with:
>
> "document that admin needs to perform clm admin lock of standby 
> controller before repairing or executing 'opensafd stop'"
>
> /Regards HansN
>
>
>
> On 11/11/2016 08:43 PM, Anders Widell wrote:
>> Ack, not tested.
>>
>> A comment is that I believe this works out of pure luck: both FM and 
>> CLM act on node down or service down events, and by the time FM acts, 
>> CLM has already removed the node from cluster membership. I am not 
>> sure if it is guaranteed to work like this in the case of running 
>> /etc/init.d/opensafd stop, and in the opposite way in case of node 
>> failures. I ack the code anyway since I see this as a temporary 
>> implementation. When we implement the "enhanced cluster management" 
>> feature, we will anyhow have to revisit this (and most likely 
>> re-write both FM and CLM..).
>>
>> regards,
>>
>> Anders Widell
>>
>>
>> On 11/04/2016 01:59 PM, Hans Nordeback wrote:
>>> osaf/services/infrastructure/fm/fms/Makefile.am |    6 +-
>>>   osaf/services/infrastructure/fm/fms/fm_main.c   |  114 
>>> ++++++++++++++++++++---
>>>   2 files changed, 104 insertions(+), 16 deletions(-)
>>>
>>>
>>> diff --git a/osaf/services/infrastructure/fm/fms/Makefile.am 
>>> b/osaf/services/infrastructure/fm/fms/Makefile.am
>>> --- a/osaf/services/infrastructure/fm/fms/Makefile.am
>>> +++ b/osaf/services/infrastructure/fm/fms/Makefile.am
>>> @@ -33,7 +33,8 @@ noinst_HEADERS = \
>>>     osaffmd_CPPFLAGS= \
>>>       $(AM_CPPFLAGS) \
>>> -    -I$(top_srcdir)/osaf/services/infrastructure/fm/include
>>> +    -I$(top_srcdir)/osaf/services/infrastructure/fm/include \
>>> +    -I$(top_srcdir)/osaf/libs/common/immsv/include
>>>     osaffmd_CFLAGS = $(AM_CFLAGS)
>>>   @@ -44,6 +45,9 @@ osaffmd_SOURCES = \
>>>       fm_amf.c
>>>     osaffmd_LDADD = \
>>> +    $(top_builddir)/osaf/tools/safimm/src/libimmutil.la \
>>> +    $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOi.la \
>>> +    $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOm.la \
>>>       $(top_builddir)/osaf/libs/core/libopensaf_core.la \
>>>       $(top_builddir)/osaf/libs/saf/libSaAmf/libSaAmf.la \
>>> $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la \
>>> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c 
>>> b/osaf/services/infrastructure/fm/fms/fm_main.c
>>> --- a/osaf/services/infrastructure/fm/fms/fm_main.c
>>> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c
>>> @@ -31,6 +31,7 @@ This file contains the main() routine fo
>>>   #include <nid_api.h>
>>>   #include "fm.h"
>>>   #include "osaf_time.h"
>>> +#include "immutil.h"
>>>     #define FM_CLM_API_TIMEOUT 10000000000LL
>>>   @@ -62,6 +63,8 @@ static uint32_t fms_fms_exchange_node_in
>>>   static uint32_t fm_nid_notify(uint32_t);
>>>   static uint32_t fm_tmr_start(FM_TMR *, SaTimeT);
>>>   static SaAisErrorT get_peer_clm_node_name(NODE_ID);
>>> +static const char* get_clm_node_name(const SaNameT* node_name);
>>> +static bool is_node_clm_member(const SaNameT *clm_node_name);
>>>   static SaAisErrorT fm_clm_init();
>>>   static void fm_mbx_msg_handler(FM_CB *, FM_EVT *);
>>>   static void fm_evt_proc_rda_callback(FM_CB*, FM_EVT*);
>>> @@ -499,6 +502,30 @@ void fm_proc_svc_down(FM_CB *cb, FM_EVT
>>>   }
>>> /****************************************************************************
>>>  
>>>
>>> +* Name          : get_node_name
>>> +*
>>> +* Description   : Extract node name from DN clm node.
>>> +*
>>> +* Arguments     : DN clm node name.
>>> +*
>>> +* Return Values : Extrated node name.
>>> +*
>>> +* Notes         : None.
>>> +*****************************************************************************/
>>>  
>>>
>>> +static const char* get_clm_node_name(const SaNameT* node_name){
>>> +    SaNameT tmp_node_name = *node_name;
>>> +    char *save_ptr;
>>> +    // Extract peer clm node name, e.g SC-2 from 
>>> "safNode=SC-2,safCluster=myClmCluster"
>>> +    // The peer clm node name will be passed to opensaf_reboot 
>>> script to support remote fencing.
>>> +    // The peer clm node name should correspond to the name of the 
>>> virtual machine for that node.
>>> +    strtok_r((char*) tmp_node_name.value, "=", &save_ptr);
>>> +    char *node = strtok_r(NULL, ",", &save_ptr);
>>> +    char *tmp = strndup(node, strlen(node));
>>> +    LOG_NO("Peer clm node name: %s", tmp);
>>> +    return tmp;
>>> +}
>>> +
>>> +/****************************************************************************
>>>  
>>>
>>>   * Name          : fm_clm_init
>>>   *
>>>   * Description   : Initialize CLM.
>>> @@ -521,16 +548,10 @@ static SaAisErrorT get_peer_clm_node_nam
>>>       }
>>>         if ((rc = saClmClusterNodeGet_4(fm_cb->clm_hdl, node_id, 
>>> FM_CLM_API_TIMEOUT, &cluster_node)) == SA_AIS_OK) {
>>> -        // Extract peer clm node name, e.g SC-2 from 
>>> "safNode=SC-2,safCluster=myClmCluster"
>>> -        // The peer clm node name will be passed to opensaf_reboot 
>>> script to support remote fencing.
>>> -        // The peer clm node name should correspond to the name of 
>>> the virtual machine for that node.
>>> -        char *node = NULL;
>>> -        strtok((char*) cluster_node.nodeName.value, "=");
>>> -        node = strtok(NULL, ",");
>>> -        strncpy((char*) fm_cb->peer_clm_node_name.value, node, 
>>> cluster_node.nodeName.length);
>>> +        fm_cb->peer_clm_node_name = cluster_node.nodeName;
>>>           LOG_NO("Peer clm node name: %s", 
>>> fm_cb->peer_clm_node_name.value);
>>>       } else {
>>> -        LOG_WA("saClmClusterNodeGet_4 returned %u", (unsigned) rc);
>>> +        LOG_WA("saClmClusterNodeGet_4 returned %d", rc);
>>>       }
>>>         if ((rc = saClmFinalize(fm_cb->clm_hdl)) != SA_AIS_OK) {
>>> @@ -551,6 +572,58 @@ static SaAisErrorT get_peer_clm_node_nam
>>>   *
>>>   * Notes         : None.
>>> *****************************************************************************/
>>>  
>>>
>>> +static bool is_node_clm_member(const SaNameT *clm_node_name)
>>> +{
>>> +    SaAisErrorT rc = SA_AIS_OK;
>>> +    SaUint32T clm_member = 0;
>>> +    SaNameT node_name = *clm_node_name;
>>> +
>>> +    SaVersionT immVersion = { 'A', 2, 15 };
>>> +    const SaImmAttrValuesT_2 **attributes;
>>> +    SaImmAccessorHandleT accessor_handle;
>>> +    SaImmHandleT om_handle;
>>> +
>>> +    if ((rc = immutil_saImmOmInitialize(&om_handle, NULL, 
>>> &immVersion)) != SA_AIS_OK) {
>>> +        LOG_ER("saImmOmInitialize FAILED: %u", rc);
>>> +        goto done;
>>> +    }
>>> +
>>> +    if ((rc = immutil_saImmOmAccessorInitialize(om_handle, 
>>> &accessor_handle)) != SA_AIS_OK) {
>>> +        LOG_ER("saImmOmAccessorInitialize FAILED: %u", rc);
>>> +        goto om_finalize;
>>> +    }
>>> +
>>> +    if ((rc = immutil_saImmOmAccessorGet_2(accessor_handle, 
>>> &node_name, NULL, (SaImmAttrValuesT_2 ***) &attributes)) != 
>>> SA_AIS_OK) {
>>> +        LOG_ER("saImmOmAccessorGet_2 FAILED: %s %u ", 
>>> node_name.value, rc);
>>> +        goto accessor_finalize;
>>> +    }
>>> +
>>> +    if ((rc = immutil_getAttr("saClmNodeIsMember", attributes, 0, 
>>> &clm_member)) != SA_AIS_OK) {
>>> +        LOG_ER("immutil_getAttr FAILED: %u", rc);
>>> +    }
>>> +accessor_finalize:
>>> +    if ((rc = immutil_saImmOmAccessorFinalize(accessor_handle)) != 
>>> SA_AIS_OK) {
>>> +        LOG_NO("immutil_saImmOmAccessorFinalize FAILED: %u", rc);
>>> +    }
>>> +om_finalize:
>>> +    if ((rc = immutil_saImmOmFinalize(om_handle)) != SA_AIS_OK) {
>>> +        LOG_NO("immutil_saImmOmFinalize FAILED: %u", rc);
>>> +    }
>>> +done:
>>> +    return (clm_member == 1) ? true : false;
>>> +}
>>> +
>>> +/****************************************************************************
>>>  
>>>
>>> +* Name          : fm_clm_init
>>> +*
>>> +* Description   : Initialize CLM.
>>> +*
>>> +* Arguments     : None.
>>> +*
>>> +* Return Values : None.
>>> +*
>>> +* Notes         : None.
>>> +*****************************************************************************/
>>>  
>>>
>>>   static SaAisErrorT fm_clm_init()
>>>   {
>>>       SaAisErrorT rc = SA_AIS_OK;
>>> @@ -622,8 +695,15 @@ static void fm_mbx_msg_handler(FM_CB *fm
>>>                        * node_down event has been received.
>>>                        */
>>>                   if (fm_cb->use_remote_fencing) {
>>> -                    opensaf_reboot(fm_cb->peer_node_id, (char 
>>> *)fm_cb->peer_clm_node_name.value,
>>> -                            "Received Node Down for peer controller");
>>> +                    const char* clm_node_name = 
>>> get_clm_node_name(&fm_cb->peer_clm_node_name);
>>> +                    if 
>>> (is_node_clm_member(&fm_cb->peer_clm_node_name)) {
>>> + opensaf_reboot(fm_cb->peer_node_id, clm_node_name,
>>> +                                "Received Node Down for peer 
>>> controller");
>>> +                    } else {
>>> +                        LOG_NO("Peer node %s is not a member of the 
>>> CLM cluster, fencing will not be performed",
>>> +                                clm_node_name);
>>> +                    }
>>> +                    free((char*)clm_node_name);
>>>                   } else {
>>>                       opensaf_reboot(fm_cb->peer_node_id, (char 
>>> *)fm_cb->peer_node_name.value,
>>>                               "Received Node Down for peer 
>>> controller");
>>> @@ -661,11 +741,15 @@ static void fm_mbx_msg_handler(FM_CB *fm
>>>                 LOG_NO("Reseting peer controller node id: %x", 
>>> fm_cb->peer_node_id);
>>>               if (fm_cb->use_remote_fencing) {
>>> -                LOG_NO("saClmClusterNodeGet succeeded node_id 0x%X, 
>>> clm peer node name %s",
>>> -                    fm_mbx_evt->node_id, 
>>> fm_cb->peer_clm_node_name.value);
>>> -
>>> -                opensaf_reboot(fm_cb->peer_node_id, (char 
>>> *)fm_cb->peer_clm_node_name.value,
>>> -                        "Received Node Down for peer controller");
>>> +                const char* clm_node_name = 
>>> get_clm_node_name(&fm_cb->peer_clm_node_name);
>>> +                if (is_node_clm_member(&fm_cb->peer_clm_node_name)) {
>>> +                    opensaf_reboot(fm_cb->peer_node_id, clm_node_name,
>>> +                            "Received Node Down for peer controller");
>>> +                } else {
>>> +                    LOG_NO("Peer node %s is not a member of the CLM 
>>> cluster, fencing will not be performed",
>>> +                            clm_node_name);
>>> +                }
>>> +                free((char*)clm_node_name);
>>>               } else {
>>>                   opensaf_reboot(fm_cb->peer_node_id, (char 
>>> *)fm_cb->peer_node_name.value,
>>>                              "Received Node Down for Active peer");
>>
>


------------------------------------------------------------------------------
Developer Access Program for Intel Xeon Phi Processors
Access to Intel Xeon Phi processor-based developer platforms.
With one year of Intel Parallel Studio XE.
Training and support from Colfax.
Order your platform today. http://sdm.link/xeonphi
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to