Hi Anders,

as discussed in ticket #2094 where it is suggested to:

"document that admin needs to perform clm admin lock of standby 
controller before repairing"

this could be complemented with:

"document that admin needs to perform clm admin lock of standby 
controller before repairing or executing 'opensafd stop'"

/Regards HansN



On 11/11/2016 08:43 PM, Anders Widell wrote:
> Ack, not tested.
>
> A comment is that I believe this works out of pure luck: both FM and 
> CLM act on node down or service down events, and by the time FM acts, 
> CLM has already removed the node from cluster membership. I am not 
> sure if it is guaranteed to work like this in the case of running 
> /etc/init.d/opensafd stop, and in the opposite way in case of node 
> failures. I ack the code anyway since I see this as a temporary 
> implementation. When we implement the "enhanced cluster management" 
> feature, we will anyhow have to revisit this (and most likely re-write 
> both FM and CLM..).
>
> regards,
>
> Anders Widell
>
>
> On 11/04/2016 01:59 PM, Hans Nordeback wrote:
>> osaf/services/infrastructure/fm/fms/Makefile.am |    6 +-
>>   osaf/services/infrastructure/fm/fms/fm_main.c   |  114 
>> ++++++++++++++++++++---
>>   2 files changed, 104 insertions(+), 16 deletions(-)
>>
>>
>> diff --git a/osaf/services/infrastructure/fm/fms/Makefile.am 
>> b/osaf/services/infrastructure/fm/fms/Makefile.am
>> --- a/osaf/services/infrastructure/fm/fms/Makefile.am
>> +++ b/osaf/services/infrastructure/fm/fms/Makefile.am
>> @@ -33,7 +33,8 @@ noinst_HEADERS = \
>>     osaffmd_CPPFLAGS= \
>>       $(AM_CPPFLAGS) \
>> -    -I$(top_srcdir)/osaf/services/infrastructure/fm/include
>> +    -I$(top_srcdir)/osaf/services/infrastructure/fm/include \
>> +    -I$(top_srcdir)/osaf/libs/common/immsv/include
>>     osaffmd_CFLAGS = $(AM_CFLAGS)
>>   @@ -44,6 +45,9 @@ osaffmd_SOURCES = \
>>       fm_amf.c
>>     osaffmd_LDADD = \
>> +    $(top_builddir)/osaf/tools/safimm/src/libimmutil.la \
>> +    $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOi.la \
>> +    $(top_builddir)/osaf/libs/saf/libSaImm/libSaImmOm.la \
>>       $(top_builddir)/osaf/libs/core/libopensaf_core.la \
>>       $(top_builddir)/osaf/libs/saf/libSaAmf/libSaAmf.la \
>> $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la \
>> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c 
>> b/osaf/services/infrastructure/fm/fms/fm_main.c
>> --- a/osaf/services/infrastructure/fm/fms/fm_main.c
>> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c
>> @@ -31,6 +31,7 @@ This file contains the main() routine fo
>>   #include <nid_api.h>
>>   #include "fm.h"
>>   #include "osaf_time.h"
>> +#include "immutil.h"
>>     #define FM_CLM_API_TIMEOUT 10000000000LL
>>   @@ -62,6 +63,8 @@ static uint32_t fms_fms_exchange_node_in
>>   static uint32_t fm_nid_notify(uint32_t);
>>   static uint32_t fm_tmr_start(FM_TMR *, SaTimeT);
>>   static SaAisErrorT get_peer_clm_node_name(NODE_ID);
>> +static const char* get_clm_node_name(const SaNameT* node_name);
>> +static bool is_node_clm_member(const SaNameT *clm_node_name);
>>   static SaAisErrorT fm_clm_init();
>>   static void fm_mbx_msg_handler(FM_CB *, FM_EVT *);
>>   static void fm_evt_proc_rda_callback(FM_CB*, FM_EVT*);
>> @@ -499,6 +502,30 @@ void fm_proc_svc_down(FM_CB *cb, FM_EVT
>>   }
>> /****************************************************************************
>> +* Name          : get_node_name
>> +*
>> +* Description   : Extract node name from DN clm node.
>> +*
>> +* Arguments     : DN clm node name.
>> +*
>> +* Return Values : Extrated node name.
>> +*
>> +* Notes         : None.
>> +*****************************************************************************/
>>  
>>
>> +static const char* get_clm_node_name(const SaNameT* node_name){
>> +    SaNameT tmp_node_name = *node_name;
>> +    char *save_ptr;
>> +    // Extract peer clm node name, e.g SC-2 from 
>> "safNode=SC-2,safCluster=myClmCluster"
>> +    // The peer clm node name will be passed to opensaf_reboot 
>> script to support remote fencing.
>> +    // The peer clm node name should correspond to the name of the 
>> virtual machine for that node.
>> +    strtok_r((char*) tmp_node_name.value, "=", &save_ptr);
>> +    char *node = strtok_r(NULL, ",", &save_ptr);
>> +    char *tmp = strndup(node, strlen(node));
>> +    LOG_NO("Peer clm node name: %s", tmp);
>> +    return tmp;
>> +}
>> +
>> +/****************************************************************************
>>  
>>
>>   * Name          : fm_clm_init
>>   *
>>   * Description   : Initialize CLM.
>> @@ -521,16 +548,10 @@ static SaAisErrorT get_peer_clm_node_nam
>>       }
>>         if ((rc = saClmClusterNodeGet_4(fm_cb->clm_hdl, node_id, 
>> FM_CLM_API_TIMEOUT, &cluster_node)) == SA_AIS_OK) {
>> -        // Extract peer clm node name, e.g SC-2 from 
>> "safNode=SC-2,safCluster=myClmCluster"
>> -        // The peer clm node name will be passed to opensaf_reboot 
>> script to support remote fencing.
>> -        // The peer clm node name should correspond to the name of 
>> the virtual machine for that node.
>> -        char *node = NULL;
>> -        strtok((char*) cluster_node.nodeName.value, "=");
>> -        node = strtok(NULL, ",");
>> -        strncpy((char*) fm_cb->peer_clm_node_name.value, node, 
>> cluster_node.nodeName.length);
>> +        fm_cb->peer_clm_node_name = cluster_node.nodeName;
>>           LOG_NO("Peer clm node name: %s", 
>> fm_cb->peer_clm_node_name.value);
>>       } else {
>> -        LOG_WA("saClmClusterNodeGet_4 returned %u", (unsigned) rc);
>> +        LOG_WA("saClmClusterNodeGet_4 returned %d", rc);
>>       }
>>         if ((rc = saClmFinalize(fm_cb->clm_hdl)) != SA_AIS_OK) {
>> @@ -551,6 +572,58 @@ static SaAisErrorT get_peer_clm_node_nam
>>   *
>>   * Notes         : None.
>> *****************************************************************************/
>> +static bool is_node_clm_member(const SaNameT *clm_node_name)
>> +{
>> +    SaAisErrorT rc = SA_AIS_OK;
>> +    SaUint32T clm_member = 0;
>> +    SaNameT node_name = *clm_node_name;
>> +
>> +    SaVersionT immVersion = { 'A', 2, 15 };
>> +    const SaImmAttrValuesT_2 **attributes;
>> +    SaImmAccessorHandleT accessor_handle;
>> +    SaImmHandleT om_handle;
>> +
>> +    if ((rc = immutil_saImmOmInitialize(&om_handle, NULL, 
>> &immVersion)) != SA_AIS_OK) {
>> +        LOG_ER("saImmOmInitialize FAILED: %u", rc);
>> +        goto done;
>> +    }
>> +
>> +    if ((rc = immutil_saImmOmAccessorInitialize(om_handle, 
>> &accessor_handle)) != SA_AIS_OK) {
>> +        LOG_ER("saImmOmAccessorInitialize FAILED: %u", rc);
>> +        goto om_finalize;
>> +    }
>> +
>> +    if ((rc = immutil_saImmOmAccessorGet_2(accessor_handle, 
>> &node_name, NULL, (SaImmAttrValuesT_2 ***) &attributes)) != SA_AIS_OK) {
>> +        LOG_ER("saImmOmAccessorGet_2 FAILED: %s %u ", 
>> node_name.value, rc);
>> +        goto accessor_finalize;
>> +    }
>> +
>> +    if ((rc = immutil_getAttr("saClmNodeIsMember", attributes, 0, 
>> &clm_member)) != SA_AIS_OK) {
>> +        LOG_ER("immutil_getAttr FAILED: %u", rc);
>> +    }
>> +accessor_finalize:
>> +    if ((rc = immutil_saImmOmAccessorFinalize(accessor_handle)) != 
>> SA_AIS_OK) {
>> +        LOG_NO("immutil_saImmOmAccessorFinalize FAILED: %u", rc);
>> +    }
>> +om_finalize:
>> +    if ((rc = immutil_saImmOmFinalize(om_handle)) != SA_AIS_OK) {
>> +        LOG_NO("immutil_saImmOmFinalize FAILED: %u", rc);
>> +    }
>> +done:
>> +    return (clm_member == 1) ? true : false;
>> +}
>> +
>> +/****************************************************************************
>>  
>>
>> +* Name          : fm_clm_init
>> +*
>> +* Description   : Initialize CLM.
>> +*
>> +* Arguments     : None.
>> +*
>> +* Return Values : None.
>> +*
>> +* Notes         : None.
>> +*****************************************************************************/
>>  
>>
>>   static SaAisErrorT fm_clm_init()
>>   {
>>       SaAisErrorT rc = SA_AIS_OK;
>> @@ -622,8 +695,15 @@ static void fm_mbx_msg_handler(FM_CB *fm
>>                        * node_down event has been received.
>>                        */
>>                   if (fm_cb->use_remote_fencing) {
>> -                    opensaf_reboot(fm_cb->peer_node_id, (char 
>> *)fm_cb->peer_clm_node_name.value,
>> -                            "Received Node Down for peer controller");
>> +                    const char* clm_node_name = 
>> get_clm_node_name(&fm_cb->peer_clm_node_name);
>> +                    if 
>> (is_node_clm_member(&fm_cb->peer_clm_node_name)) {
>> +                        opensaf_reboot(fm_cb->peer_node_id, 
>> clm_node_name,
>> +                                "Received Node Down for peer 
>> controller");
>> +                    } else {
>> +                        LOG_NO("Peer node %s is not a member of the 
>> CLM cluster, fencing will not be performed",
>> +                                clm_node_name);
>> +                    }
>> +                    free((char*)clm_node_name);
>>                   } else {
>>                       opensaf_reboot(fm_cb->peer_node_id, (char 
>> *)fm_cb->peer_node_name.value,
>>                               "Received Node Down for peer controller");
>> @@ -661,11 +741,15 @@ static void fm_mbx_msg_handler(FM_CB *fm
>>                 LOG_NO("Reseting peer controller node id: %x", 
>> fm_cb->peer_node_id);
>>               if (fm_cb->use_remote_fencing) {
>> -                LOG_NO("saClmClusterNodeGet succeeded node_id 0x%X, 
>> clm peer node name %s",
>> -                    fm_mbx_evt->node_id, 
>> fm_cb->peer_clm_node_name.value);
>> -
>> -                opensaf_reboot(fm_cb->peer_node_id, (char 
>> *)fm_cb->peer_clm_node_name.value,
>> -                        "Received Node Down for peer controller");
>> +                const char* clm_node_name = 
>> get_clm_node_name(&fm_cb->peer_clm_node_name);
>> +                if (is_node_clm_member(&fm_cb->peer_clm_node_name)) {
>> +                    opensaf_reboot(fm_cb->peer_node_id, clm_node_name,
>> +                            "Received Node Down for peer controller");
>> +                } else {
>> +                    LOG_NO("Peer node %s is not a member of the CLM 
>> cluster, fencing will not be performed",
>> +                            clm_node_name);
>> +                }
>> +                free((char*)clm_node_name);
>>               } else {
>>                   opensaf_reboot(fm_cb->peer_node_id, (char 
>> *)fm_cb->peer_node_name.value,
>>                              "Received Node Down for Active peer");
>


------------------------------------------------------------------------------
Developer Access Program for Intel Xeon Phi Processors
Access to Intel Xeon Phi processor-based developer platforms.
With one year of Intel Parallel Studio XE.
Training and support from Colfax.
Order your platform today. http://sdm.link/xeonphi
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to