Hi! Some regression tests fail after I apply this patch. I haven't analyzed the cause yet, though.
If you have a way to reproduce the original problem reported in the ticket, could you try my suggestion of not removing the active role from RDE until AMF has completely shut down? Maybe it would be as simple as: diff --git a/src/rde/rded/rde_amf.cc b/src/rde/rded/rde_amf.cc --- a/src/rde/rded/rde_amf.cc +++ b/src/rde/rded/rde_amf.cc @@ -65,10 +65,6 @@ void rde_saf_CSI_rem_callback(SaInvocati TRACE_ENTER2("current role: %d", static_cast<int>(rde_amf_cb->role->role())); uint32_t rc = NCSCC_RC_SUCCESS; SaAisErrorT error = SA_AIS_OK; - if ((rc = rde_amf_cb->role->SetRole(PCS_RDA_QUIESCED)) != NCSCC_RC_SUCCESS) { - LOG_ER("SetRole failed %u", (unsigned) rc); - error = SA_AIS_ERR_FAILED_OPERATION; - } error = saAmfResponse(rde_amf_cb->amf_hdl, invocation, error); TRACE_LEAVE2("rc = %u, error = %d", (unsigned) rc, (int) error); } regards, Anders Widell On 02/24/2017 12:34 PM, ramesh betham wrote: > I don't think RED_UP always come first or later MDS_UP or vice-versa. > Most likley it depends on the sequence of svc registrations happens > with MDS. > > Thanks, > Ramesh. > > On 2/24/2017 4:48 PM, praveen malviya wrote: >> >> >> On 24-Feb-17 4:07 PM, ramesh betham wrote: >>> Good catch. Hitting the case of fm_peer_down_wait() is very unlikely. >>> >>> But here fm_peer_down_wait() is called only before fm nid_notifies and >>> considering for amfnd-up event too. A rare and race condition can hit >>> where fm on upcoming new active receives fm-down event and amfnd is >>> still alive. >>> >> But the if block where cb->amfnd_down is marked false assumes that >> cb->peer_node_id is already set in RED_UP events of IMMD or AVD. Is >> there any guarantee from MDS that RED_UP event will always come >> before normal MDS_UP event? >> >> Thanks, >> Praveen >> >>> Thanks, >>> Ramesh. >>> >>> On 2/24/2017 2:18 PM, praveen malviya wrote: >>>> Hi Ramesh, >>>> >>>> One minor query: >>>> In RED_UP of peer AVD, newly active SC will reboot itself if peer FM >>>> on old active SC is not up. 
If this is true then in which situations >>>> newly active SC will wait in fm_peer_down_wait(). >>>> >>>> Thanks, >>>> Praveen >>>> >>>> >>>> On 22-Feb-17 5:00 PM, ramesh.bet...@oracle.com wrote: >>>>> src/fm/fmd/fm_cb.h | 3 + >>>>> src/fm/fmd/fm_evt.h | 2 +- >>>>> src/fm/fmd/fm_main.c | 114 +++++++++++++++++--------------- >>>>> src/fm/fmd/fm_mds.c | 173 >>>>> +++++++++++++++++++++++++++++++++++--------------- >>>>> 4 files changed, 186 insertions(+), 106 deletions(-) >>>>> >>>>> >>>>> This patch addresses the specific scenario where the new Active is >>>>> coming up and has discovered the amfd process on the peer node (which >>>>> is going down) is still alive. Here the peer amfd/amfnd is still in >>>>> the process of going down i.e., progressing in termination of >>>>> application components having big timeouts etc. >>>>> >>>>> diff --git a/src/fm/fmd/fm_cb.h b/src/fm/fmd/fm_cb.h >>>>> --- a/src/fm/fmd/fm_cb.h >>>>> +++ b/src/fm/fmd/fm_cb.h >>>>> @@ -1,6 +1,7 @@ >>>>> /* -*- OpenSAF -*- >>>>> * >>>>> * (C) Copyright 2008 The OpenSAF Foundation >>>>> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights >>>>> reserved. >>>>> * >>>>> * This program is distributed in the hope that it will be useful, >>>>> but >>>>> * WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> MERCHANTABILITY >>>>> @@ -107,6 +108,8 @@ typedef struct fm_cb { >>>>> bool use_remote_fencing; >>>>> SaNameT peer_clm_node_name; >>>>> bool peer_node_terminated; >>>>> + NCS_SEL_OBJ peer_down_obj; >>>>> + int peer_down_await; >>>>> } FM_CB; >>>>> >>>>> extern char *role_string[]; >>>>> diff --git a/src/fm/fmd/fm_evt.h b/src/fm/fmd/fm_evt.h >>>>> --- a/src/fm/fmd/fm_evt.h >>>>> +++ b/src/fm/fmd/fm_evt.h >>>>> @@ -1,6 +1,7 @@ >>>>> /* -*- OpenSAF -*- >>>>> * >>>>> * (C) Copyright 2008 The OpenSAF Foundation >>>>> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights >>>>> reserved. 
>>>>> * >>>>> * This program is distributed in the hope that it will be useful, >>>>> but >>>>> * WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> MERCHANTABILITY >>>>> @@ -49,7 +50,6 @@ typedef enum { >>>>> FM_EVT_NODE_DOWN, >>>>> FM_EVT_PEER_UP, >>>>> FM_EVT_RDA_ROLE, >>>>> - FM_EVT_SVC_DOWN, >>>>> FM_FSM_EVT_MAX >>>>> } FM_FSM_EVT_CODE; >>>>> >>>>> diff --git a/src/fm/fmd/fm_main.c b/src/fm/fmd/fm_main.c >>>>> --- a/src/fm/fmd/fm_main.c >>>>> +++ b/src/fm/fmd/fm_main.c >>>>> @@ -1,6 +1,7 @@ >>>>> /* -*- OpenSAF -*- >>>>> * >>>>> * (C) Copyright 2008 The OpenSAF Foundation >>>>> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights >>>>> reserved. >>>>> * >>>>> * This program is distributed in the hope that it will be useful, >>>>> but >>>>> * WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> MERCHANTABILITY >>>>> @@ -31,6 +32,7 @@ This file contains the main() routine fo >>>>> #include "nid/agent/nid_api.h" >>>>> #include "fm.h" >>>>> #include "base/osaf_time.h" >>>>> +#include "base/osaf_poll.h" >>>>> >>>>> #define FM_CLM_API_TIMEOUT 10000000000LL >>>>> >>>>> @@ -71,7 +73,6 @@ void handle_mbx_event(void); >>>>> extern uint32_t fm_amf_init(FM_AMF_CB *fm_amf_cb); >>>>> uint32_t gl_fm_hdl; >>>>> static NCS_SEL_OBJ usr1_sel_obj; >>>>> -void fm_proc_svc_down(FM_CB *cb, FM_EVT *fm_mbx_evt); >>>>> >>>>> /** >>>>> * USR1 signal is used when AMF wants instantiate us as a >>>>> @@ -119,6 +120,40 @@ static void rda_cb(uint32_t cb_hdl, PCS_ >>>>> TRACE_LEAVE(); >>>>> } >>>>> >>>>> +/* This function primarily handles the weird situation in a cluster >>>>> where the controller >>>>> + * node which is coming up identifies the peer node is in the midst >>>>> of DOWN process (i.e., >>>>> + * non-existance of peer FM and amfd/amfnd is still alive). In this >>>>> case, the controller >>>>> + * node has to wait till the peer gracefully shutdowns. 
This >>>>> function returns FAILURE if >>>>> + * peer controller node is not down in a timeout period of >>>>> OPENSAF_TERMTIMEOUT (or 60 secs default). >>>>> + */ >>>>> +static uint32_t fm_peer_down_wait(FM_CB *fm_cb) >>>>> +{ >>>>> + char *envVar = NULL; >>>>> + int peer_term_timeout = 60; /*default 60 secs */ >>>>> + >>>>> + TRACE_ENTER(); >>>>> + >>>>> + /* Hoping that "OPENSAF_TERMTIMEOUT" on both the controllers >>>>> shall be the same */ >>>>> + if ((envVar = getenv("OPENSAF_TERMTIMEOUT"))) >>>>> + peer_term_timeout = atoi(envVar); >>>>> + >>>>> + m_NCS_SEL_OBJ_CREATE(&fm_cb->peer_down_obj); >>>>> + fm_cb->peer_down_await = 1; >>>>> + >>>>> + osaf_poll_one_fd(m_GET_FD_FROM_SEL_OBJ(fm_cb->peer_down_obj), >>>>> peer_term_timeout*1000); >>>>> + >>>>> + m_NCS_SEL_OBJ_DESTROY(&fm_cb->peer_down_obj); >>>>> + >>>>> + /* Return failure if peer node is not yet completely down */ >>>>> + if(fm_cb->peer_down_await) { >>>>> + LOG_ER("Peer node is not fully DOWN, please check"); >>>>> + TRACE_LEAVE(); >>>>> + return NCSCC_RC_FAILURE; >>>>> + } >>>>> + >>>>> + TRACE_LEAVE(); >>>>> + return NCSCC_RC_SUCCESS; >>>>> +} >>>>> >>>>> >>>>> /***************************************************************************** >>>>> >>>>> >>>>> >>>>> >>>>> @@ -176,6 +211,11 @@ int main(int argc, char *argv[]) >>>>> */ >>>>> fm_cb->control_tipc = true; /* Default behaviour */ >>>>> >>>>> + fm_cb->immd_down = true; >>>>> + fm_cb->immnd_down = true; >>>>> + fm_cb->amfnd_down = true; >>>>> + fm_cb->amfd_down = true; >>>>> + >>>>> /* Create CB handle */ >>>>> gl_fm_hdl = ncshm_create_hdl(NCS_HM_POOL_ID_COMMON, >>>>> NCS_SERVICE_ID_GFM, (NCSCONTEXT)fm_cb); >>>>> >>>>> @@ -194,7 +234,7 @@ int main(int argc, char *argv[]) >>>>> goto fm_init_failed; >>>>> } >>>>> >>>>> -/* Attach MBX */ >>>>> + /* Attach MBX */ >>>>> if (m_NCS_IPC_ATTACH(&fm_cb->mbx) != NCSCC_RC_SUCCESS) { >>>>> syslog(LOG_ERR, "m_NCS_IPC_ATTACH() failed."); >>>>> goto fm_init_failed; >>>>> @@ -245,6 +285,16 @@ int main(int argc, 
char *argv[]) >>>>> goto fm_init_failed; >>>>> } >>>>> >>>>> + /* Weird and rare situation. If peer fm doesn't exist, but >>>>> amfd/amfnd process(es) >>>>> + * are still alive then wait till the peer gracefully shutsdown. >>>>> + */ >>>>> + if((!fm_cb->peer_sc_up) && !(fm_cb->amfnd_down && >>>>> fm_cb->amfd_down)) { >>>>> + if(fm_peer_down_wait(fm_cb) != NCSCC_RC_SUCCESS) { >>>>> + LOG_ER("Exiting.. Peer node is not completely >>>>> DOWN, please check"); >>>>> + goto fm_init_failed; >>>>> + } >>>>> + } >>>>> + >>>>> /* Get mailbox selection object */ >>>>> mbx_sel_obj = m_NCS_IPC_GET_SEL_OBJ(&fm_cb->mbx); >>>>> >>>>> @@ -268,7 +318,7 @@ int main(int argc, char *argv[]) >>>>> >>>>> /* notify the NID */ >>>>> if (nid_started) >>>>> - fm_nid_notify(NCSCC_RC_SUCCESS); >>>>> + fm_nid_notify((uint32_t) NCSCC_RC_SUCCESS); >>>>> >>>>> while (1) { >>>>> ret = poll(fds, nfds, -1); >>>>> @@ -454,52 +504,6 @@ static uint32_t fm_get_args(FM_CB *fm_cb >>>>> return NCSCC_RC_SUCCESS; >>>>> } >>>>> >>>>> -void fm_proc_svc_down(FM_CB *cb, FM_EVT *fm_mbx_evt) >>>>> -{ >>>>> - switch (fm_mbx_evt->svc_id) { >>>>> - case NCSMDS_SVC_ID_IMMND: >>>>> - cb->immnd_down = true; >>>>> - LOG_NO("IMMND down on: %x", cb->peer_node_id); >>>>> - break; >>>>> - case NCSMDS_SVC_ID_AVND: >>>>> - cb->amfnd_down = true; >>>>> - LOG_NO("AMFND down on: %x", cb->peer_node_id); >>>>> - break; >>>>> - case NCSMDS_SVC_ID_IMMD: >>>>> - cb->immd_down = true; >>>>> - LOG_NO("IMMD down on: %x", cb->peer_node_id); >>>>> - break; >>>>> - case NCSMDS_SVC_ID_AVD: >>>>> - cb->amfd_down = true; >>>>> - LOG_NO("AVD down on: %x", cb->peer_node_id); >>>>> - break; >>>>> - case NCSMDS_SVC_ID_GFM: >>>>> - cb->fm_down = true; >>>>> - LOG_NO("FM down on: %x", cb->peer_node_id); >>>>> - break; >>>>> - default: >>>>> - break; >>>>> - } >>>>> - >>>>> - /* Processing only for alternate node. >>>>> - * Service downs of AMFND, IMMD, IMMND is the same as NODE_DOWN >>>>> from 4.4 onwards. 
>>>>> - * This is required to handle the usecase involving >>>>> - * '/etc/init.d/opensafd stop' without an OS reboot cycle >>>>> - * Process service downs only if OpenSAF is not controlling TIPC. >>>>> - * If OpenSAF is controlling TIPC, just wait for NODE_DOWN to >>>>> trigger failover. >>>>> - */ >>>>> - if (cb->immd_down && cb->immnd_down && cb->amfnd_down && >>>>> cb->amfd_down && cb->fm_down) { >>>>> - LOG_NO("Core services went down on node_id: %x", >>>>> fm_mbx_evt->node_id); >>>>> - fm_send_node_down_to_mbx(cb, fm_mbx_evt->node_id); >>>>> - /* Reset peer downs, because we've made MDS RED >>>>> subscriptions */ >>>>> - cb->immd_down = false; >>>>> - cb->immnd_down = false; >>>>> - cb->amfnd_down = false; >>>>> - cb->amfd_down = false; >>>>> - cb->fm_down = false; >>>>> - } >>>>> -} >>>>> - >>>>> >>>>> /**************************************************************************** >>>>> >>>>> >>>>> >>>>> * Name : fm_clm_init >>>>> * >>>>> @@ -642,11 +646,10 @@ static void fm_mbx_msg_handler(FM_CB *fm >>>>> } >>>>> } >>>>> break; >>>>> - case FM_EVT_SVC_DOWN: >>>>> - fm_proc_svc_down(fm_cb, fm_mbx_evt); >>>>> - break; >>>>> + >>>>> case FM_EVT_PEER_UP: >>>>> -/* Peer fm came up so sending ee_id of this node */ >>>>> + >>>>> + /* Peer fm came up so sending ee_id of this node */ >>>>> if (fm_cb->node_name.length != 0) >>>>> fms_fms_exchange_node_info(fm_cb); >>>>> >>>>> @@ -654,8 +657,9 @@ static void fm_mbx_msg_handler(FM_CB *fm >>>>> get_peer_clm_node_name(fm_mbx_evt->node_id); >>>>> } >>>>> break; >>>>> + >>>>> case FM_EVT_TMR_EXP: >>>>> -/* Timer Expiry event posted */ >>>>> + /* Timer Expiry event posted */ >>>>> if (fm_mbx_evt->info.fm_tmr->type == >>>>> FM_TMR_PROMOTE_ACTIVE) { >>>>> /* Check whether node(AMF) initialization is done */ >>>>> if (fm_cb->csi_assigned == false) { >>>>> @@ -684,9 +688,11 @@ static void fm_mbx_msg_handler(FM_CB *fm >>>>> "within the time limit"); >>>>> } >>>>> break; >>>>> + >>>>> case FM_EVT_RDA_ROLE: >>>>> 
fm_evt_proc_rda_callback(fm_cb, fm_mbx_evt); >>>>> break; >>>>> + >>>>> default: >>>>> break; >>>>> } >>>>> diff --git a/src/fm/fmd/fm_mds.c b/src/fm/fmd/fm_mds.c >>>>> --- a/src/fm/fmd/fm_mds.c >>>>> +++ b/src/fm/fmd/fm_mds.c >>>>> @@ -1,6 +1,7 @@ >>>>> /* -*- OpenSAF -*- >>>>> * >>>>> * (C) Copyright 2008 The OpenSAF Foundation >>>>> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights >>>>> reserved. >>>>> * >>>>> * This program is distributed in the hope that it will be useful, >>>>> but >>>>> * WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> MERCHANTABILITY >>>>> @@ -34,6 +35,7 @@ static void check_for_node_isolation(FM_ >>>>> static bool has_been_well_connected_recently(FM_CB *cb); >>>>> static uint32_t fm_mds_node_evt(FM_CB *cb, >>>>> MDS_CALLBACK_NODE_EVENT_INFO * node_evt); >>>>> static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT >>>>> *fm_evt, NODE_ID node_id, FM_FSM_EVT_CODE evt_code); >>>>> +static void fm_proc_svc_down(FM_CB *cb, uint32_t node_id, >>>>> NCSMDS_SVC_ID svc_id); >>>>> >>>>> uint32_t >>>>> fm_mds_sync_send(FM_CB *fm_cb, NCSCONTEXT msg, >>>>> @@ -62,7 +64,7 @@ uint32_t fm_mds_init(FM_CB *cb) >>>>> { >>>>> NCSMDS_INFO arg; >>>>> MDS_SVC_ID svc_id[] = { NCSMDS_SVC_ID_GFM, NCSMDS_SVC_ID_AVND, >>>>> NCSMDS_SVC_ID_IMMND }; >>>>> - MDS_SVC_ID immd_id[2] = { NCSMDS_SVC_ID_IMMD, >>>>> NCSMDS_SVC_ID_AVD }; >>>>> + MDS_SVC_ID svc_red_id[2] = { NCSMDS_SVC_ID_IMMD, >>>>> NCSMDS_SVC_ID_AVD }; >>>>> >>>>> /* Get the MDS handles to be used. 
*/ >>>>> if (fm_mds_get_adest_hdls(cb) != NCSCC_RC_SUCCESS) { >>>>> @@ -111,7 +113,7 @@ uint32_t fm_mds_init(FM_CB *cb) >>>>> arg.i_op = MDS_RED_SUBSCRIBE; >>>>> arg.info.svc_subscribe.i_num_svcs = 2; >>>>> arg.info.svc_subscribe.i_scope = NCSMDS_SCOPE_NONE; >>>>> - arg.info.svc_subscribe.i_svc_ids = immd_id; >>>>> + arg.info.svc_subscribe.i_svc_ids = svc_red_id; >>>>> if (ncsmds_api(&arg) == NCSCC_RC_FAILURE) { >>>>> syslog(LOG_ERR, "MDS_RED_SUBSCRIBE failed"); >>>>> arg.i_op = MDS_UNINSTALL; >>>>> @@ -285,25 +287,56 @@ uint32_t fm_send_node_down_to_mbx(FM_CB >>>>> return rc; >>>>> } >>>>> >>>>> -static void fm_send_svc_down_to_mbx(FM_CB *cb, uint32_t node_id, >>>>> NCSMDS_SVC_ID svc_id) >>>>> +void fm_proc_svc_down(FM_CB *cb, uint32_t node_id, NCSMDS_SVC_ID >>>>> svc_id) >>>>> { >>>>> - FM_EVT *fm_evt = NULL; >>>>> - uint32_t rc = NCSCC_RC_SUCCESS; >>>>> - fm_evt = m_MMGR_ALLOC_FM_EVT; >>>>> - if (NULL == fm_evt) { >>>>> - syslog(LOG_INFO, "fm_mds_rcv_evt: fm_evt allocation >>>>> FAILED."); >>>>> - return; >>>>> + TRACE_ENTER2("SVC ID: %d", (int) svc_id); >>>>> + switch (svc_id) { >>>>> + case NCSMDS_SVC_ID_IMMND: >>>>> + cb->immnd_down = true; >>>>> + LOG_NO("IMMND down on: %x", cb->peer_node_id); >>>>> + break; >>>>> + case NCSMDS_SVC_ID_AVND: >>>>> + cb->amfnd_down = true; >>>>> + LOG_NO("AMFND down on: %x", cb->peer_node_id); >>>>> + break; >>>>> + case NCSMDS_SVC_ID_IMMD: >>>>> + cb->immd_down = true; >>>>> + LOG_NO("IMMD down on: %x", cb->peer_node_id); >>>>> + break; >>>>> + case NCSMDS_SVC_ID_AVD: >>>>> + cb->amfd_down = true; >>>>> + LOG_NO("AVD down on: %x", cb->peer_node_id); >>>>> + break; >>>>> + case NCSMDS_SVC_ID_GFM: >>>>> + cb->fm_down = true; >>>>> + LOG_NO("FM down on: %x", cb->peer_node_id); >>>>> + break; >>>>> + default: >>>>> + break; >>>>> } >>>>> - fm_evt->svc_id = svc_id; >>>>> - rc = fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, node_id, >>>>> FM_EVT_SVC_DOWN); >>>>> - if (rc == NCSCC_RC_FAILURE) { >>>>> - m_MMGR_FREE_FM_EVT(fm_evt); >>>>> - 
LOG_IN("service down event post to mailbox failed"); >>>>> - fm_evt = NULL; >>>>> + >>>>> + /* Processing only for alternate node. >>>>> + * Service downs of AMFND, IMMD, IMMND is the same as NODE_DOWN >>>>> from 4.4 onwards. >>>>> + * This is required to handle the usecase involving >>>>> + * '/etc/init.d/opensafd stop' without an OS reboot cycle >>>>> + * Process service downs only if OpenSAF is not controlling >>>>> TIPC. >>>>> + * If OpenSAF is controlling TIPC, just wait for NODE_DOWN to >>>>> trigger failover. >>>>> + */ >>>>> + if (cb->immd_down && cb->immnd_down && cb->amfnd_down && >>>>> cb->amfd_down && cb->fm_down) { >>>>> + LOG_NO("Core services went down on node_id: %x", node_id); >>>>> + if (cb->peer_down_await) { >>>>> + cb->peer_down_await = 0; >>>>> + m_NCS_SEL_OBJ_IND(&cb->peer_down_obj); >>>>> + } >>>>> + >>>>> + if(!cb->control_tipc) >>>>> + fm_send_node_down_to_mbx(cb, node_id); >>>>> } >>>>> - return; >>>>> + >>>>> + TRACE_LEAVE(); >>>>> } >>>>> >>>>> + >>>>> static void check_for_node_isolation(FM_CB *cb) >>>>> { >>>>> bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3; >>>>> @@ -393,8 +426,7 @@ static uint32_t fm_mds_node_evt(FM_CB *c >>>>> >>>>> *****************************************************************************/ >>>>> >>>>> >>>>> >>>>> static uint32_t fm_mds_svc_evt(FM_CB *cb, >>>>> MDS_CALLBACK_SVC_EVENT_INFO *svc_evt) >>>>> { >>>>> - uint32_t return_val = NCSCC_RC_SUCCESS; >>>>> - FM_EVT *fm_evt; >>>>> + FM_EVT *fm_evt = NULL; >>>>> TRACE_ENTER(); >>>>> >>>>> if (NULL == svc_evt) { >>>>> @@ -413,43 +445,29 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb >>>>> cb->peer_sc_up = false; >>>>> check_for_node_isolation(cb); >>>>> cb->peer_adest = 0; >>>>> - if (!cb->control_tipc) { >>>>> - fm_send_svc_down_to_mbx(cb, >>>>> svc_evt->i_node_id, svc_evt->i_svc_id); >>>>> - } >>>>> + >>>>> + fm_proc_svc_down(cb, svc_evt->i_node_id, >>>>> svc_evt->i_svc_id); >>>>> } >>>>> break; >>>>> case NCSMDS_SVC_ID_IMMND: >>>>> - if 
(svc_evt->i_node_id == cb->peer_node_id >>>>> - && !cb->control_tipc) { >>>>> - fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, >>>>> svc_evt->i_svc_id); >>>>> - } >>>>> - break; >>>>> case NCSMDS_SVC_ID_AVND: >>>>> - if (svc_evt->i_node_id == cb->peer_node_id >>>>> - && !cb->control_tipc) { >>>>> - fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, >>>>> svc_evt->i_svc_id); >>>>> + if (svc_evt->i_node_id == cb->peer_node_id) { >>>>> + fm_proc_svc_down(cb, svc_evt->i_node_id, >>>>> svc_evt->i_svc_id); >>>>> } >>>>> break; >>>>> default: >>>>> TRACE("Not interested in service down of other >>>>> services"); >>>>> break; >>>>> } >>>>> - >>>>> break; >>>>> >>>>> case NCSMDS_RED_DOWN: >>>>> switch (svc_evt->i_svc_id) { >>>>> /* Depend on service downs if OpenSAF is not controling >>>>> TIPC */ >>>>> case NCSMDS_SVC_ID_IMMD: >>>>> - if (svc_evt->i_node_id == cb->peer_node_id >>>>> - && !cb->control_tipc) { >>>>> - fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, >>>>> svc_evt->i_svc_id); >>>>> - } >>>>> - break; >>>>> case NCSMDS_SVC_ID_AVD: >>>>> - if (svc_evt->i_node_id == cb->peer_node_id >>>>> - && !cb->control_tipc) { >>>>> - fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, >>>>> svc_evt->i_svc_id); >>>>> + if (svc_evt->i_node_id == cb->peer_node_id) { >>>>> + fm_proc_svc_down(cb, svc_evt->i_node_id, >>>>> svc_evt->i_svc_id); >>>>> } >>>>> break; >>>>> default: >>>>> @@ -465,43 +483,96 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb >>>>> TRACE("Peer fm status change: %d -> %d, peer node id >>>>> is: %x, cluster size is %llu", >>>>> (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>>> (unsigned long long) cb->cluster_size); >>>>> cb->peer_sc_up = true; >>>>> + cb->fm_down = false; >>>>> check_for_node_isolation(cb); >>>>> >>>>> fm_evt = m_MMGR_ALLOC_FM_EVT; >>>>> - if (NULL == fm_evt) { >>>>> - syslog(LOG_INFO, "fm_mds_svc_evt: fm_evt >>>>> allocation FAILED."); >>>>> - return NCSCC_RC_FAILURE; >>>>> - } >>>>> + if (NULL == fm_evt) { >>>>> + syslog(LOG_INFO, "fm_mds_svc_evt: 
fm_evt >>>>> allocation FAILED."); >>>>> + return NCSCC_RC_FAILURE; >>>>> + } >>>>> + >>>>> cb->peer_adest = svc_evt->i_dest; >>>>> cb->peer_node_id = svc_evt->i_node_id; >>>>> cb->peer_node_terminated = false; >>>>> - return_val = fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, >>>>> cb->peer_node_id, FM_EVT_PEER_UP); >>>>> >>>>> - if (NCSCC_RC_FAILURE == return_val) { >>>>> - m_MMGR_FREE_FM_EVT(fm_evt); >>>>> - fm_evt = NULL; >>>>> - } >>>>> + if(fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, >>>>> cb->peer_node_id, FM_EVT_PEER_UP) == NCSCC_RC_FAILURE) >>>>> + { >>>>> + m_MMGR_FREE_FM_EVT(fm_evt); >>>>> + fm_evt = NULL; >>>>> + } >>>>> } >>>>> break; >>>>> + >>>>> case NCSMDS_SVC_ID_IMMND: >>>>> - if (svc_evt->i_node_id == cb->peer_node_id >>>>> - && !cb->control_tipc) >>>>> - cb->immnd_down = false; /* Only IMMND is >>>>> restartable */ >>>>> + if (svc_evt->i_node_id == cb->peer_node_id){ >>>>> + TRACE("Peer immnd status change: %d -> %d, peer node >>>>> id is: %x, cluster size is %llu", >>>>> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>>> (unsigned long long) cb->cluster_size); >>>>> + cb->immnd_down = false; >>>>> + } >>>>> + break; >>>>> + >>>>> + case NCSMDS_SVC_ID_AVND: >>>>> + if (svc_evt->i_node_id == cb->peer_node_id){ >>>>> + TRACE("Peer amfnd status change: %d -> %d, peer node >>>>> id is: %x, cluster size is %llu", >>>>> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>>> (unsigned long long) cb->cluster_size); >>>>> + cb->amfnd_down = false; >>>>> + } >>>>> break; >>>>> default: >>>>> break; >>>>> } >>>>> break; >>>>> >>>>> + case NCSMDS_RED_UP: >>>>> + switch (svc_evt->i_svc_id) { >>>>> + /* Depend on service downs if OpenSAF is not controling >>>>> TIPC */ >>>>> + case NCSMDS_SVC_ID_IMMD: >>>>> + if (svc_evt->i_node_id != cb->node_id) { >>>>> + TRACE("Peer immd status change: %d -> %d, peer node >>>>> id is: %x, cluster size is %llu", >>>>> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>>> (unsigned long long) cb->cluster_size); >>>>> + cb->peer_node_id 
= svc_evt->i_node_id; >>>>> + cb->immd_down = false; >>>>> + >>>>> + /* Arrived svc up event of amfd/amfnd/immd/immnd >>>>> svc's with out fm svc-up event being arrived. >>>>> + * It can be due to peer node is going down but not >>>>> fully down. hence reboot the node. >>>>> + */ >>>>> + if (!fm_cb->peer_sc_up) >>>>> + opensaf_reboot(0, NULL, "Peer is not completely >>>>> DOWN, Received svc up of peer IMMD"); >>>>> + } >>>>> + break; >>>>> + >>>>> + case NCSMDS_SVC_ID_AVD: >>>>> + if (svc_evt->i_node_id != cb->node_id) { >>>>> + TRACE("Peer amfd status change: %d -> %d, peer node >>>>> id is: %x, cluster size is %llu", >>>>> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>>> (unsigned long long) cb->cluster_size); >>>>> + cb->peer_node_id = svc_evt->i_node_id; >>>>> + cb->amfd_down = false; >>>>> + >>>>> + /* Arrived svc up event of amfd/amfnd/immd/immnd >>>>> svc's with out fm svc-up event being arrived. >>>>> + * It can be due to peer node is going down but not >>>>> fully down. hence reboot the node. >>>>> + */ >>>>> + if (!fm_cb->peer_sc_up) >>>>> + opensaf_reboot(0, NULL, "Peer is not completely >>>>> DOWN, Received svc up of peer AMFD"); >>>>> + } >>>>> + break; >>>>> + >>>>> + default: >>>>> + TRACE("Not interested in service down of other >>>>> services"); >>>>> + break; >>>>> + } >>>>> + break; >>>>> + >>>>> default: >>>>> syslog(LOG_INFO, "Wrong MDS event"); >>>>> break; >>>>> } >>>>> >>>>> TRACE_LEAVE(); >>>>> - return return_val; >>>>> + return NCSCC_RC_SUCCESS; >>>>> } >>>>> >>>>> + >>>>> >>>>> /*************************************************************************** >>>>> >>>>> >>>>> >>>>> * Name : fm_mds_rcv_evt >>>>> * >>>>> >>> > ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! 
http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel