Ack, not tested (because I don't have the testbed :-)). Mathi.
> -----Original Message----- > From: Hans Nordeback [mailto:hans.nordeb...@ericsson.com] > Sent: Wednesday, November 23, 2016 7:15 PM > To: Ramesh Babu Betham; Mathivanan Naickan Palanivelu; > anders.wid...@ericsson.com > Cc: opensaf-devel@lists.sourceforge.net > Subject: [PATCH 1 of 1] fm: Add support for differentiating a hung node > versus a stopped node V3 [#2160] > > osaf/services/infrastructure/fm/fms/fm_cb.h | 1 + > osaf/services/infrastructure/fm/fms/fm_evt.h | 1 + > osaf/services/infrastructure/fm/fms/fm_main.c | 54 > +++++++++++++++++++++++--- > osaf/services/infrastructure/fm/fms/fm_mds.c | 12 +++++- > 4 files changed, 59 insertions(+), 9 deletions(-) > > > diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h > b/osaf/services/infrastructure/fm/fms/fm_cb.h > --- a/osaf/services/infrastructure/fm/fms/fm_cb.h > +++ b/osaf/services/infrastructure/fm/fms/fm_cb.h > @@ -106,6 +106,7 @@ typedef struct fm_cb { > SaClmHandleT clm_hdl; > bool use_remote_fencing; > SaNameT peer_clm_node_name; > + bool peer_node_terminated; > } FM_CB; > > extern char *role_string[]; > diff --git a/osaf/services/infrastructure/fm/fms/fm_evt.h > b/osaf/services/infrastructure/fm/fms/fm_evt.h > --- a/osaf/services/infrastructure/fm/fms/fm_evt.h > +++ b/osaf/services/infrastructure/fm/fms/fm_evt.h > @@ -21,6 +21,7 @@ > /* EVT from other GFM over MDS.*/ > typedef enum { > GFM_GFM_EVT_NODE_INFO_EXCHANGE, > + GFM_GFM_EVT_PEER_IS_TERMINATING, > GFM_GFM_EVT_MAX > } GFM_GFM_MSG_TYPE; > > diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c > b/osaf/services/infrastructure/fm/fms/fm_main.c > --- a/osaf/services/infrastructure/fm/fms/fm_main.c > +++ b/osaf/services/infrastructure/fm/fms/fm_main.c > @@ -59,6 +59,7 @@ char *role_string[] = { "UNDEFINED", "AC static uint32_t > fm_agents_startup(void); static uint32_t fm_get_args(FM_CB *); static > uint32_t fms_fms_exchange_node_info(FM_CB *); > +static uint32_t fms_fms_inform_terminating(FM_CB *fm_cb); > static uint32_t 
fm_nid_notify(uint32_t); static uint32_t > fm_tmr_start(FM_TMR *, SaTimeT); static SaAisErrorT > get_peer_clm_node_name(NODE_ID); @@ -280,6 +281,7 @@ int main(int > argc, char *argv[]) > } > > if (fds[FD_TERM].revents & POLLIN) { > + fms_fms_inform_terminating(fm_cb); > daemon_exit(); > } > > @@ -622,8 +624,12 @@ static void fm_mbx_msg_handler(FM_CB *fm > * node_down event has been > received. > */ > if (fm_cb->use_remote_fencing) { > - opensaf_reboot(fm_cb- > >peer_node_id, (char *)fm_cb->peer_clm_node_name.value, > - "Received Node > Down for peer controller"); > + if (fm_cb->peer_node_terminated > == false) { > + opensaf_reboot(fm_cb- > >peer_node_id, (char *)fm_cb->peer_clm_node_name.value, > + "Received > Node Down for peer controller"); > + } else { > + LOG_NO("Peer node %s is > terminated, fencing will not be performed", fm_cb- > >peer_clm_node_name.value); > + } > } else { > opensaf_reboot(fm_cb- > >peer_node_id, (char *)fm_cb->peer_node_name.value, > "Received Node > Down for peer controller"); @@ -661,11 +667,12 @@ static void > fm_mbx_msg_handler(FM_CB *fm > > LOG_NO("Reseting peer controller node id: %x", > fm_cb->peer_node_id); > if (fm_cb->use_remote_fencing) { > - LOG_NO("saClmClusterNodeGet succeeded > node_id 0x%X, clm peer node name %s", > - fm_mbx_evt->node_id, fm_cb- > >peer_clm_node_name.value); > - > - opensaf_reboot(fm_cb->peer_node_id, > (char *)fm_cb->peer_clm_node_name.value, > - "Received Node Down for > peer controller"); > + if (fm_cb->peer_node_terminated == false) { > + opensaf_reboot(fm_cb- > >peer_node_id, (char *)fm_cb->peer_clm_node_name.value, > + "Received Node > Down for peer controller"); > + } else { > + LOG_NO("Peer node %s is > terminated, fencing will not be performed", fm_cb- > >peer_clm_node_name.value); > + } > } else { > opensaf_reboot(fm_cb->peer_node_id, > (char *)fm_cb->peer_node_name.value, > "Received Node Down for Active > peer"); @@ -868,6 +875,39 @@ static uint32_t fms_fms_exchange_node_in > } > > > 
/********************************************************** > ****************** > +* Name : fms_fms_inform_terminating > +* > +* Description : sends information to peer that terminating is undergoing. > +* > +* Arguments : Pointer to Control Block. > +* > +* Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. > +* > +* Notes : None. > +********************************************************* > ************** > +******/ static uint32_t fms_fms_inform_terminating(FM_CB *fm_cb) { > + GFM_GFM_MSG gfm_msg; > + TRACE_ENTER(); > + if (fm_cb->peer_adest != 0) { > +/* peer fms present */ > + memset(&gfm_msg, 0, sizeof(GFM_GFM_MSG)); > + gfm_msg.msg_type = > GFM_GFM_EVT_PEER_IS_TERMINATING; > + > + if (NCSCC_RC_SUCCESS != fm_mds_async_send(fm_cb, > (NCSCONTEXT)&gfm_msg, > + > NCSMDS_SVC_ID_GFM, MDS_SEND_PRIORITY_VERY_HIGH, > + 0, fm_cb- > >peer_adest, 0)) { > + syslog(LOG_ERR, "Sending node-info message to > peer fms failed"); > + return NCSCC_RC_FAILURE; > + } > + > + return NCSCC_RC_SUCCESS; > + } > + TRACE_LEAVE(); > + return NCSCC_RC_FAILURE; > +} > + > +/********************************************************* > ************* > +****** > * Name : fm_nid_notify > * > * Description : Sends notification to NID > diff --git a/osaf/services/infrastructure/fm/fms/fm_mds.c > b/osaf/services/infrastructure/fm/fms/fm_mds.c > --- a/osaf/services/infrastructure/fm/fms/fm_mds.c > +++ b/osaf/services/infrastructure/fm/fms/fm_mds.c > @@ -474,6 +474,7 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb > } > cb->peer_adest = svc_evt->i_dest; > cb->peer_node_id = svc_evt->i_node_id; > + cb->peer_node_terminated = false; > return_val = > fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, cb->peer_node_id, > FM_EVT_PEER_UP); > > if (NCSCC_RC_FAILURE == return_val) { @@ - > 533,7 +534,9 @@ static uint32_t fm_mds_rcv_evt(FM_CB *cb > cb->peer_node_name.length); > LOG_IN("Peer Node_id %u : EE_ID %s", cb- > >peer_node_id, cb->peer_node_name.value); > break; > - > + case GFM_GFM_EVT_PEER_IS_TERMINATING: > + 
fm_cb->peer_node_terminated = true; > + break; > default: > syslog(LOG_INFO, "Wrong MDS event from GFM."); > return_val = NCSCC_RC_FAILURE; > @@ -768,7 +771,9 @@ static uint32_t fm_fm_mds_enc(MDS_CALLBA > ncs_encode_n_octets_in_uba(uba, msg- > >info.node_info.node_name.value, > (uint32_t)msg- > >info.node_info.node_name.length); > break; > - > + case GFM_GFM_EVT_PEER_IS_TERMINATING: > + fm_cb->peer_node_terminated = true; > + break; > default: > syslog(LOG_INFO, "fm_fm_mds_enc: Invalid msg type for > encode."); > return m_LEAP_DBG_SINK(NCSCC_RC_FAILURE); > @@ -830,6 +835,9 @@ static uint32_t fm_fm_mds_dec(MDS_CALLBA > ncs_decode_n_octets_from_uba(uba, msg- > >info.node_info.node_name.value, > msg- > >info.node_info.node_name.length); > break; > + case GFM_GFM_EVT_PEER_IS_TERMINATING: > + fm_cb->peer_node_terminated = true; > + break; > default: > syslog(LOG_INFO, "fm_fm_mds_dec: Invalid msg for > decoding."); > return m_LEAP_DBG_SINK(NCSCC_RC_FAILURE); ------------------------------------------------------------------------------ Developer Access Program for Intel Xeon Phi Processors Access to Intel Xeon Phi processor-based developer platforms. With one year of Intel Parallel Studio XE. Training and support from Colfax. Order your platform today.http://sdm.link/xeonphi _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel