Re: [devel] [PATCH 1 of 2] fm: Add support for self-fencing [#1859]

Anders Widell Tue, 12 Jul 2016 06:38:07 -0700

One comment: in the prototype patch the feature was on by default, but 
it ought to be off by default when we introduce this feature officially.


/ Anders Widell

On 06/30/2016 10:32 AM, Anders Widell wrote:
> Hi!
>
> This patch is actually identical to the prototype code that I wrote 
> and attached to the ticket, so I am not sure if I am supposed to also 
> review it... anyways it is ack from me for the first patch. :-)
>
> regards,
> Anders Widell
>
> On 06/23/2016 07:31 AM, Hans Nordeback wrote:
>> osaf/services/infrastructure/fm/fms/fm_cb.h   |  10 +++++
>>   osaf/services/infrastructure/fm/fms/fm_main.c |  16 +++++++-
>>   osaf/services/infrastructure/fm/fms/fm_mds.c  |  51 
>> +++++++++++++++++++++++++++
>>   3 files changed, 75 insertions(+), 2 deletions(-)
>>
>>
>> In situations where remote fencing is not possible, this patch adds 
>> support for self-fencing.
>>
>> diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h 
>> b/osaf/services/infrastructure/fm/fms/fm_cb.h
>> --- a/osaf/services/infrastructure/fm/fms/fm_cb.h
>> +++ b/osaf/services/infrastructure/fm/fms/fm_cb.h
>> @@ -27,6 +27,10 @@
>>   #include "rda_papi.h"
>>   #include "fm_amf.h"
>>   +#include <stdbool.h>
>> +#include <stdint.h>
>> +#include <time.h>
>> +
>>   uint32_t gl_fm_hdl;
>>     typedef enum {
>> @@ -92,6 +96,12 @@ typedef struct fm_cb {
>>       bool amfnd_down;
>>       bool amfd_down;
>>       bool fm_down;
>> +
>> +    bool peer_sc_up;
>> +    bool well_connected;
>> +    uint64_t cluster_size;
>> +    struct timespec last_well_connected;
>> +    struct timespec node_isolation_timeout;
>>   } FM_CB;
>>     extern char *role_string[];
>> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c 
>> b/osaf/services/infrastructure/fm/fms/fm_main.c
>> --- a/osaf/services/infrastructure/fm/fms/fm_main.c
>> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c
>> @@ -30,7 +30,7 @@ This file contains the main() routine fo
>>     #include <nid_api.h>
>>   #include "fm.h"
>> -
>> +#include "osaf_time.h"
>>     enum {
>>       FD_TERM = 0,
>> @@ -411,7 +411,19 @@ static uint32_t fm_get_args(FM_CB *fm_cb
>>       fm_cb->promote_active_tmr.type = FM_TMR_PROMOTE_ACTIVE;
>>       fm_cb->activation_supervision_tmr.type = 
>> FM_TMR_ACTIVATION_SUPERVISION;
>>   -      TRACE_LEAVE();
>> +    char* node_isolation_timeout = 
>> getenv("FMS_NODE_ISOLATION_TIMEOUT");
>> +    if (node_isolation_timeout != NULL) {
>> +        osaf_millis_to_timespec(atoi(node_isolation_timeout),
>> +                    &fm_cb->node_isolation_timeout);
>> +    } else {
>> +        fm_cb->node_isolation_timeout.tv_sec = 10;
>> +        fm_cb->node_isolation_timeout.tv_nsec = 0;
>> +    }
>> +    TRACE("NODE_ISOLATION_TIMEOUT = %" PRId64 ".%09ld",
>> +          (int64_t) fm_cb->node_isolation_timeout.tv_sec,
>> +          fm_cb->node_isolation_timeout.tv_nsec);
>> +
>> +    TRACE_LEAVE();
>>       return NCSCC_RC_SUCCESS;
>>   }
>>   diff --git a/osaf/services/infrastructure/fm/fms/fm_mds.c 
>> b/osaf/services/infrastructure/fm/fms/fm_mds.c
>> --- a/osaf/services/infrastructure/fm/fms/fm_mds.c
>> +++ b/osaf/services/infrastructure/fm/fms/fm_mds.c
>> @@ -16,6 +16,8 @@
>>   */
>>     #include "fm.h"
>> +#include "osaf_time.h"
>> +#include "ncssysf_def.h"
>>     const MDS_CLIENT_MSG_FORMAT_VER 
>> fm_fm_msg_fmt_map_table[FM_SUBPART_VER_MAX] = { FM_FM_MSG_FMT_VER_1 };
>>   @@ -28,6 +30,8 @@ static uint32_t fm_encode(MDS_CALLBACK_E
>>   static uint32_t fm_decode(MDS_CALLBACK_DEC_INFO *dec_info);
>>   static uint32_t fm_fm_mds_enc(MDS_CALLBACK_ENC_INFO *enc_info);
>>   static uint32_t fm_fm_mds_dec(MDS_CALLBACK_DEC_INFO *dec_info);
>> +static void check_for_node_isolation(FM_CB *cb);
>> +static bool has_been_well_connected_recently(FM_CB *cb);
>>   static uint32_t fm_mds_node_evt(FM_CB *cb, 
>> MDS_CALLBACK_NODE_EVENT_INFO * node_evt);
>>   static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT 
>> *fm_evt, NODE_ID node_id, FM_FSM_EVT_CODE evt_code);
>>   @@ -300,6 +304,27 @@ static void fm_send_svc_down_to_mbx(FM_C
>>       return;
>>   }
>>   +static void check_for_node_isolation(FM_CB *cb)
>> +{
>> +    bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3;
>> +    if (cb->well_connected && !well_connected) {
>> +        osaf_clock_gettime(CLOCK_MONOTONIC, &cb->last_well_connected);
>> +    }
>> +    cb->well_connected = well_connected;
>> +}
>> +
>> +static bool has_been_well_connected_recently(FM_CB *cb)
>> +{
>> +    if (cb->well_connected) return true;
>> +    struct timespec current;
>> +    struct timespec difference;
>> +    osaf_clock_gettime(CLOCK_MONOTONIC, &current);
>> +    if (osaf_timespec_compare(&current, &cb->last_well_connected) < 
>> 0) return false;
>> +    osaf_timespec_subtract(&current, &cb->last_well_connected, 
>> &difference);
>> +    if (osaf_timespec_compare(&difference, 
>> &cb->node_isolation_timeout) < 0) return true;
>> +    return false;
>> +}
>> +
>> /****************************************************************************
>>   * Name          : fm_mds_node_evt
>>   *
>> @@ -318,6 +343,20 @@ static uint32_t fm_mds_node_evt(FM_CB *c
>>         switch (node_evt->node_chg) {
>>       case NCSMDS_NODE_DOWN:
>> +        if (cb->cluster_size != 0) {
>> +            --cb->cluster_size;
>> +            TRACE("Node down event for node id %x, cluster size is 
>> now: %llu",
>> +                  node_evt->node_id, (unsigned long long) 
>> cb->cluster_size);
>> +            check_for_node_isolation(cb);
>> +            if (cb->cluster_size == 1 && 
>> has_been_well_connected_recently(cb)) {
>> +                opensaf_reboot(0, NULL,
>> +                        "Self-fencing due to sudden loss of contact 
>> with the rest of the cluster");
>> +            }
>> +        } else {
>> +            TRACE("Node down event for node id %x ignored", 
>> node_evt->node_id);
>> +            LOG_ER("Received unexpected node down event for node id 
>> %x", node_evt->node_id);
>> +        }
>> +
>>           if (node_evt->node_id == cb->peer_node_id && 
>> cb->control_tipc) {
>>               /* Process NODE_DOWN only if OpenSAF is controling TIPC */
>>               LOG_NO("Node Down event for node id %x:", 
>> node_evt->node_id);
>> @@ -326,6 +365,10 @@ static uint32_t fm_mds_node_evt(FM_CB *c
>>           break;
>>         case NCSMDS_NODE_UP:
>> +        ++cb->cluster_size;
>> +        TRACE("Node up event for node id %x, cluster size is now: 
>> %llu",
>> +              node_evt->node_id, (unsigned long long) 
>> cb->cluster_size);
>> +        check_for_node_isolation(cb);
>>           break;
>>         default:
>> @@ -365,6 +408,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
>>               /* Depend on service downs if OpenSAF is not controling 
>> TIPC */
>>               case NCSMDS_SVC_ID_GFM:
>>                   if (svc_evt->i_node_id == cb->peer_node_id) {
>> +                    TRACE("Peer fm status change: %d -> %d, peer 
>> node id is: %x, cluster size is %llu",
>> +                          (int) cb->peer_sc_up, 0, 
>> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
>> +                    cb->peer_sc_up = false;
>> +                    check_for_node_isolation(cb);
>>                       cb->peer_adest = 0;
>>                       if (!cb->control_tipc) {
>>                           fm_send_svc_down_to_mbx(cb, 
>> svc_evt->i_node_id, svc_evt->i_svc_id);
>> @@ -415,6 +462,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
>>           switch (svc_evt->i_svc_id) {
>>           case NCSMDS_SVC_ID_GFM:
>>               if ((svc_evt->i_node_id != cb->node_id) && 
>> (m_MDS_DEST_IS_AN_ADEST(svc_evt->i_dest) == true)) {
>> +                TRACE("Peer fm status change: %d -> %d, peer node id 
>> is: %x, cluster size is %llu",
>> +                      (int) cb->peer_sc_up, 1, svc_evt->i_node_id, 
>> (unsigned long long) cb->cluster_size);
>> +                cb->peer_sc_up = true;
>> +                check_for_node_isolation(cb);
>>                     fm_evt = m_MMGR_ALLOC_FM_EVT;
>>                   if (NULL == fm_evt) {
>


------------------------------------------------------------------------------
What NetFlow Analyzer can do for you? Monitors network bandwidth and traffic
patterns at an interface-level. Reveals which users, apps, and protocols are 
consuming the most bandwidth. Provides multi-vendor support for NetFlow, 
J-Flow, sFlow and other flows. Make informed decisions using capacity planning
reports. http://sdm.link/zohodev2dev
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1 of 2] fm: Add support for self-fencing [#1859]

Reply via email to