Hi!

This patch is actually identical to the prototype code that I wrote and 
attached to the ticket, so I am not sure if I am supposed to also review 
it... anyway, it is an ack from me for the first patch. :-)

regards,
Anders Widell

On 06/23/2016 07:31 AM, Hans Nordeback wrote:
>   osaf/services/infrastructure/fm/fms/fm_cb.h   |  10 +++++
>   osaf/services/infrastructure/fm/fms/fm_main.c |  16 +++++++-
>   osaf/services/infrastructure/fm/fms/fm_mds.c  |  51 
> +++++++++++++++++++++++++++
>   3 files changed, 75 insertions(+), 2 deletions(-)
>
>
> In situations where remote fencing is not possible, this patch adds support 
> for self-fencing.
>
> diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h 
> b/osaf/services/infrastructure/fm/fms/fm_cb.h
> --- a/osaf/services/infrastructure/fm/fms/fm_cb.h
> +++ b/osaf/services/infrastructure/fm/fms/fm_cb.h
> @@ -27,6 +27,10 @@
>   #include "rda_papi.h"
>   #include "fm_amf.h"
>   
> +#include <stdbool.h>
> +#include <stdint.h>
> +#include <time.h>
> +
>   uint32_t gl_fm_hdl;
>   
>   typedef enum {
> @@ -92,6 +96,12 @@ typedef struct fm_cb {
>       bool amfnd_down;
>       bool amfd_down;
>       bool fm_down;
> +
> +     bool peer_sc_up;
> +     bool well_connected;
> +     uint64_t cluster_size;
> +     struct timespec last_well_connected;
> +     struct timespec node_isolation_timeout;
>   } FM_CB;
>   
>   extern char *role_string[];
> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c 
> b/osaf/services/infrastructure/fm/fms/fm_main.c
> --- a/osaf/services/infrastructure/fm/fms/fm_main.c
> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c
> @@ -30,7 +30,7 @@ This file contains the main() routine fo
>   
>   #include <nid_api.h>
>   #include "fm.h"
> -
> +#include "osaf_time.h"
>   
>   enum {
>       FD_TERM = 0,
> @@ -411,7 +411,19 @@ static uint32_t fm_get_args(FM_CB *fm_cb
>       fm_cb->promote_active_tmr.type = FM_TMR_PROMOTE_ACTIVE;
>       fm_cb->activation_supervision_tmr.type = FM_TMR_ACTIVATION_SUPERVISION;
>   
> -     TRACE_LEAVE();
> +     char* node_isolation_timeout = getenv("FMS_NODE_ISOLATION_TIMEOUT");
> +     if (node_isolation_timeout != NULL) {
> +             osaf_millis_to_timespec(atoi(node_isolation_timeout),
> +                                     &fm_cb->node_isolation_timeout);
> +     } else {
> +             fm_cb->node_isolation_timeout.tv_sec = 10;
> +             fm_cb->node_isolation_timeout.tv_nsec = 0;
> +     }
> +     TRACE("NODE_ISOLATION_TIMEOUT = %" PRId64 ".%09ld",
> +           (int64_t) fm_cb->node_isolation_timeout.tv_sec,
> +           fm_cb->node_isolation_timeout.tv_nsec);
> +
> +     TRACE_LEAVE();
>       return NCSCC_RC_SUCCESS;
>   }
>   
> diff --git a/osaf/services/infrastructure/fm/fms/fm_mds.c 
> b/osaf/services/infrastructure/fm/fms/fm_mds.c
> --- a/osaf/services/infrastructure/fm/fms/fm_mds.c
> +++ b/osaf/services/infrastructure/fm/fms/fm_mds.c
> @@ -16,6 +16,8 @@
>   */
>   
>   #include "fm.h"
> +#include "osaf_time.h"
> +#include "ncssysf_def.h"
>   
>   const MDS_CLIENT_MSG_FORMAT_VER fm_fm_msg_fmt_map_table[FM_SUBPART_VER_MAX] 
> = { FM_FM_MSG_FMT_VER_1 };
>   
> @@ -28,6 +30,8 @@ static uint32_t fm_encode(MDS_CALLBACK_E
>   static uint32_t fm_decode(MDS_CALLBACK_DEC_INFO *dec_info);
>   static uint32_t fm_fm_mds_enc(MDS_CALLBACK_ENC_INFO *enc_info);
>   static uint32_t fm_fm_mds_dec(MDS_CALLBACK_DEC_INFO *dec_info);
> +static void check_for_node_isolation(FM_CB *cb);
> +static bool has_been_well_connected_recently(FM_CB *cb);
>   static uint32_t fm_mds_node_evt(FM_CB *cb, MDS_CALLBACK_NODE_EVENT_INFO * 
> node_evt);
>   static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT *fm_evt, 
> NODE_ID node_id, FM_FSM_EVT_CODE evt_code);
>   
> @@ -300,6 +304,27 @@ static void fm_send_svc_down_to_mbx(FM_C
>       return;
>   }
>   
> +static void check_for_node_isolation(FM_CB *cb)
> +{
> +     bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3;
> +     if (cb->well_connected && !well_connected) {
> +             osaf_clock_gettime(CLOCK_MONOTONIC, &cb->last_well_connected);
> +     }
> +     cb->well_connected = well_connected;
> +}
> +
> +static bool has_been_well_connected_recently(FM_CB *cb)
> +{
> +     if (cb->well_connected) return true;
> +     struct timespec current;
> +     struct timespec difference;
> +     osaf_clock_gettime(CLOCK_MONOTONIC, &current);
> +     if (osaf_timespec_compare(&current, &cb->last_well_connected) < 0) 
> return false;
> +     osaf_timespec_subtract(&current, &cb->last_well_connected, &difference);
> +     if (osaf_timespec_compare(&difference, &cb->node_isolation_timeout) < 
> 0) return true;
> +     return false;
> +}
> +
>   
> /****************************************************************************
>   * Name          : fm_mds_node_evt
>   *
> @@ -318,6 +343,20 @@ static uint32_t fm_mds_node_evt(FM_CB *c
>   
>       switch (node_evt->node_chg) {
>       case NCSMDS_NODE_DOWN:
> +             if (cb->cluster_size != 0) {
> +                     --cb->cluster_size;
> +                     TRACE("Node down event for node id %x, cluster size is 
> now: %llu",
> +                           node_evt->node_id, (unsigned long long) 
> cb->cluster_size);
> +                     check_for_node_isolation(cb);
> +                     if (cb->cluster_size == 1 && 
> has_been_well_connected_recently(cb)) {
> +                             opensaf_reboot(0, NULL,
> +                                             "Self-fencing due to sudden 
> loss of contact with the rest of the cluster");
> +                     }
> +             } else {
> +                     TRACE("Node down event for node id %x ignored", 
> node_evt->node_id);
> +                     LOG_ER("Received unexpected node down event for node id 
> %x", node_evt->node_id);
> +             }
> +
>               if (node_evt->node_id == cb->peer_node_id && cb->control_tipc) {
>                       /* Process NODE_DOWN only if OpenSAF is controling TIPC 
> */
>                       LOG_NO("Node Down event for node id %x:", 
> node_evt->node_id);
> @@ -326,6 +365,10 @@ static uint32_t fm_mds_node_evt(FM_CB *c
>               break;
>   
>       case NCSMDS_NODE_UP:
> +             ++cb->cluster_size;
> +             TRACE("Node up event for node id %x, cluster size is now: %llu",
> +                   node_evt->node_id, (unsigned long long) cb->cluster_size);
> +             check_for_node_isolation(cb);
>               break;
>   
>       default:
> @@ -365,6 +408,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
>                       /* Depend on service downs if OpenSAF is not controling 
> TIPC */
>                       case NCSMDS_SVC_ID_GFM:
>                               if (svc_evt->i_node_id == cb->peer_node_id) {
> +                                     TRACE("Peer fm status change: %d -> %d, 
> peer node id is: %x, cluster size is %llu",
> +                                           (int) cb->peer_sc_up, 0, 
> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
> +                                     cb->peer_sc_up = false;
> +                                     check_for_node_isolation(cb);
>                                       cb->peer_adest = 0;
>                                       if (!cb->control_tipc) {
>                                               fm_send_svc_down_to_mbx(cb, 
> svc_evt->i_node_id, svc_evt->i_svc_id);
> @@ -415,6 +462,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
>               switch (svc_evt->i_svc_id) {
>               case NCSMDS_SVC_ID_GFM:
>                       if ((svc_evt->i_node_id != cb->node_id) && 
> (m_MDS_DEST_IS_AN_ADEST(svc_evt->i_dest) == true)) {
> +                             TRACE("Peer fm status change: %d -> %d, peer 
> node id is: %x, cluster size is %llu",
> +                                   (int) cb->peer_sc_up, 1, 
> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
> +                             cb->peer_sc_up = true;
> +                             check_for_node_isolation(cb);
>   
>                               fm_evt = m_MMGR_ALLOC_FM_EVT;
>                               if (NULL == fm_evt) {


------------------------------------------------------------------------------
Attend Shape: An AT&T Tech Expo July 15-16. Meet us at AT&T Park in San
Francisco, CA to explore cutting-edge tech and listen to tech luminaries
present their vision of the future. This family event has something for
everyone, including kids. Get more information and register today.
http://sdm.link/attshape
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to