Ack (Code review only). Thanks -Nagu
> -----Original Message----- > From: ramesh betham > Sent: 02 March 2017 15:39 > To: [email protected] > Subject: Re: [devel] [PATCH 1 of 1] osaf:fm on new-Active handling amfd up > event of peer old-Active node which is going down[#2151] V2 > > Hi, > > Correction to the patch. > > + /* Weird situation in a cluster, where the new-Active > controller node founds the peer node > + * (old-Active) is still in the progress of shutdown (i.e., > amfd/immd is still alive). > + */ > + if ((fm_cb->role == PCS_RDA_ACTIVE) && (fm_cb- > >csi_assigned == false)) { > + LOG_ER("Two active controllers observed in a > cluster, newActive: %x and old-Active: %x", fm_cb->node_id, fm_cb- > >peer_node_id); > + opensaf_reboot(fm_cb->peer_node_id, NULL, > correction: opensaf_reboot(0, NULL, > + "Received svc up from peer node (old-active is not > fully DOWN), hence rebooting the new Active"); > + } > > Thanks, > Ramesh. > > On 3/2/2017 2:02 PM, [email protected] wrote: > > src/fm/fmd/fm_evt.h | 2 +- > > src/fm/fmd/fm_main.c | 78 ++++++--------------- > > src/fm/fmd/fm_mds.c | 181 > ++++++++++++++++++++++++++++++++++++-------------- > > 3 files changed, 155 insertions(+), 106 deletions(-) > > > > > > diff --git a/src/fm/fmd/fm_evt.h b/src/fm/fmd/fm_evt.h > > --- a/src/fm/fmd/fm_evt.h > > +++ b/src/fm/fmd/fm_evt.h > > @@ -1,6 +1,7 @@ > > /* -*- OpenSAF -*- > > * > > * (C) Copyright 2008 The OpenSAF Foundation > > +* Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved. 
> > * > > * This program is distributed in the hope that it will be useful, but > > * WITHOUT ANY WARRANTY; without even the implied warranty of > > MERCHANTABILITY @@ -49,7 +50,6 @@ typedef enum { > > FM_EVT_NODE_DOWN, > > FM_EVT_PEER_UP, > > FM_EVT_RDA_ROLE, > > - FM_EVT_SVC_DOWN, > > FM_FSM_EVT_MAX > > } FM_FSM_EVT_CODE; > > > > diff --git a/src/fm/fmd/fm_main.c b/src/fm/fmd/fm_main.c > > --- a/src/fm/fmd/fm_main.c > > +++ b/src/fm/fmd/fm_main.c > > @@ -1,6 +1,7 @@ > > /* -*- OpenSAF -*- > > * > > * (C) Copyright 2008 The OpenSAF Foundation > > +* Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved. > > * > > * This program is distributed in the hope that it will be useful, but > > * WITHOUT ANY WARRANTY; without even the implied warranty of > > MERCHANTABILITY @@ -31,6 +32,7 @@ This file contains the main() > routine fo > > #include "nid/agent/nid_api.h" > > #include "fm.h" > > #include "base/osaf_time.h" > > +#include "base/osaf_poll.h" > > > > #define FM_CLM_API_TIMEOUT 10000000000LL > > > > @@ -71,7 +73,6 @@ void handle_mbx_event(void); > > extern uint32_t fm_amf_init(FM_AMF_CB *fm_amf_cb); > > uint32_t gl_fm_hdl; > > static NCS_SEL_OBJ usr1_sel_obj; > > -void fm_proc_svc_down(FM_CB *cb, FM_EVT *fm_mbx_evt); > > > > /** > > * USR1 signal is used when AMF wants instantiate us as a @@ -176,6 > > +177,11 @@ int main(int argc, char *argv[]) > > */ > > fm_cb->control_tipc = true; /* Default behaviour */ > > > > + fm_cb->immd_down = true; > > + fm_cb->immnd_down = true; > > + fm_cb->amfnd_down = true; > > + fm_cb->amfd_down = true; > > + > > /* Create CB handle */ > > gl_fm_hdl = ncshm_create_hdl(NCS_HM_POOL_ID_COMMON, > > NCS_SERVICE_ID_GFM, (NCSCONTEXT)fm_cb); > > > > @@ -194,7 +200,7 @@ int main(int argc, char *argv[]) > > goto fm_init_failed; > > } > > > > -/* Attach MBX */ > > + /* Attach MBX */ > > if (m_NCS_IPC_ATTACH(&fm_cb->mbx) != NCSCC_RC_SUCCESS) { > > syslog(LOG_ERR, "m_NCS_IPC_ATTACH() failed."); > > goto fm_init_failed; > > @@ 
-268,7 +274,7 @@ int main(int argc, char *argv[]) > > > > /* notify the NID */ > > if (nid_started) > > - fm_nid_notify(NCSCC_RC_SUCCESS); > > + fm_nid_notify((uint32_t) NCSCC_RC_SUCCESS); > > > > while (1) { > > ret = poll(fds, nfds, -1); > > @@ -454,52 +460,6 @@ static uint32_t fm_get_args(FM_CB *fm_cb > > return NCSCC_RC_SUCCESS; > > } > > > > -void fm_proc_svc_down(FM_CB *cb, FM_EVT *fm_mbx_evt) -{ > > - switch (fm_mbx_evt->svc_id) { > > - case NCSMDS_SVC_ID_IMMND: > > - cb->immnd_down = true; > > - LOG_NO("IMMND down on: %x", cb- > >peer_node_id); > > - break; > > - case NCSMDS_SVC_ID_AVND: > > - cb->amfnd_down = true; > > - LOG_NO("AMFND down on: %x", cb->peer_node_id); > > - break; > > - case NCSMDS_SVC_ID_IMMD: > > - cb->immd_down = true; > > - LOG_NO("IMMD down on: %x", cb->peer_node_id); > > - break; > > - case NCSMDS_SVC_ID_AVD: > > - cb->amfd_down = true; > > - LOG_NO("AVD down on: %x", cb->peer_node_id); > > - break; > > - case NCSMDS_SVC_ID_GFM: > > - cb->fm_down = true; > > - LOG_NO("FM down on: %x", cb->peer_node_id); > > - break; > > - default: > > - break; > > - } > > - > > - /* Processing only for alternate node. > > - * Service downs of AMFND, IMMD, IMMND is the same as > NODE_DOWN from 4.4 onwards. > > - * This is required to handle the usecase involving > > - * '/etc/init.d/opensafd stop' without an OS reboot cycle > > - * Process service downs only if OpenSAF is not controlling TIPC. > > - * If OpenSAF is controlling TIPC, just wait for NODE_DOWN to > trigger failover. 
> > - */ > > - if (cb->immd_down && cb->immnd_down && cb->amfnd_down && > cb->amfd_down && cb->fm_down) { > > - LOG_NO("Core services went down on node_id: %x", > fm_mbx_evt->node_id); > > - fm_send_node_down_to_mbx(cb, fm_mbx_evt->node_id); > > - /* Reset peer downs, because we've made MDS RED > subscriptions */ > > - cb->immd_down = false; > > - cb->immnd_down = false; > > - cb->amfnd_down = false; > > - cb->amfd_down = false; > > - cb->fm_down = false; > > - } > > -} > > - > > > /************************************************************* > *************** > > * Name : fm_clm_init > > * > > @@ -642,11 +602,18 @@ static void fm_mbx_msg_handler(FM_CB *fm > > } > > } > > break; > > - case FM_EVT_SVC_DOWN: > > - fm_proc_svc_down(fm_cb, fm_mbx_evt); > > - break; > > + > > case FM_EVT_PEER_UP: > > -/* Peer fm came up so sending ee_id of this node */ > > + /* Weird situation in a cluster, where the new-Active > controller node founds the peer node > > + * (old-Active) is still in the progress of shutdown (i.e., > amfd/immd is still alive). 
> > + */ > > + if ((fm_cb->role == PCS_RDA_ACTIVE) && (fm_cb- > >csi_assigned == false)) { > > + LOG_ER("Two active controllers observed in a > cluster, newActive: %x and old-Active: %x", fm_cb->node_id, fm_cb- > >peer_node_id); > > + opensaf_reboot(fm_cb->peer_node_id, NULL, > > + "Received svc up from peer node (old-active is not > fully DOWN), hence rebooting the new Active"); > > + } > > + > > + /* Peer fm came up so sending ee_id of this node */ > > if (fm_cb->node_name.length != 0) > > fms_fms_exchange_node_info(fm_cb); > > > > @@ -654,8 +621,9 @@ static void fm_mbx_msg_handler(FM_CB *fm > > get_peer_clm_node_name(fm_mbx_evt->node_id); > > } > > break; > > + > > case FM_EVT_TMR_EXP: > > -/* Timer Expiry event posted */ > > + /* Timer Expiry event posted */ > > if (fm_mbx_evt->info.fm_tmr->type == > FM_TMR_PROMOTE_ACTIVE) { > > /* Check whether node(AMF) initialization is done */ > > if (fm_cb->csi_assigned == false) { @@ -684,9 > +652,11 @@ static > > void fm_mbx_msg_handler(FM_CB *fm > > "within the time limit"); > > } > > break; > > + > > case FM_EVT_RDA_ROLE: > > fm_evt_proc_rda_callback(fm_cb, fm_mbx_evt); > > break; > > + > > default: > > break; > > } > > diff --git a/src/fm/fmd/fm_mds.c b/src/fm/fmd/fm_mds.c > > --- a/src/fm/fmd/fm_mds.c > > +++ b/src/fm/fmd/fm_mds.c > > @@ -1,6 +1,7 @@ > > /* -*- OpenSAF -*- > > * > > * (C) Copyright 2008 The OpenSAF Foundation > > +* Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved. 
> > * > > * This program is distributed in the hope that it will be useful, but > > * WITHOUT ANY WARRANTY; without even the implied warranty of > > MERCHANTABILITY @@ -34,6 +35,7 @@ static void > check_for_node_isolation(FM_ > > static bool has_been_well_connected_recently(FM_CB *cb); > > static uint32_t fm_mds_node_evt(FM_CB *cb, > MDS_CALLBACK_NODE_EVENT_INFO * node_evt); > > static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT > > *fm_evt, NODE_ID node_id, FM_FSM_EVT_CODE evt_code); > > +static void fm_proc_svc_down(FM_CB *cb, uint32_t node_id, > > +NCSMDS_SVC_ID svc_id); > > > > uint32_t > > fm_mds_sync_send(FM_CB *fm_cb, NCSCONTEXT msg, @@ -62,7 +64,7 > @@ > > uint32_t fm_mds_init(FM_CB *cb) > > { > > NCSMDS_INFO arg; > > MDS_SVC_ID svc_id[] = { NCSMDS_SVC_ID_GFM, > NCSMDS_SVC_ID_AVND, NCSMDS_SVC_ID_IMMND }; > > - MDS_SVC_ID immd_id[2] = { NCSMDS_SVC_ID_IMMD, > NCSMDS_SVC_ID_AVD }; > > + MDS_SVC_ID svc_red_id[2] = { NCSMDS_SVC_ID_IMMD, > NCSMDS_SVC_ID_AVD > > +}; > > > > /* Get the MDS handles to be used. 
*/ > > if (fm_mds_get_adest_hdls(cb) != NCSCC_RC_SUCCESS) { @@ -111,7 > > +113,7 @@ uint32_t fm_mds_init(FM_CB *cb) > > arg.i_op = MDS_RED_SUBSCRIBE; > > arg.info.svc_subscribe.i_num_svcs = 2; > > arg.info.svc_subscribe.i_scope = NCSMDS_SCOPE_NONE; > > - arg.info.svc_subscribe.i_svc_ids = immd_id; > > + arg.info.svc_subscribe.i_svc_ids = svc_red_id; > > if (ncsmds_api(&arg) == NCSCC_RC_FAILURE) { > > syslog(LOG_ERR, "MDS_RED_SUBSCRIBE failed"); > > arg.i_op = MDS_UNINSTALL; > > @@ -285,25 +287,52 @@ uint32_t fm_send_node_down_to_mbx(FM_CB > > return rc; > > } > > > > -static void fm_send_svc_down_to_mbx(FM_CB *cb, uint32_t node_id, > > NCSMDS_SVC_ID svc_id) > > +void fm_proc_svc_down(FM_CB *cb, uint32_t node_id, NCSMDS_SVC_ID > > +svc_id) > > { > > - FM_EVT *fm_evt = NULL; > > - uint32_t rc = NCSCC_RC_SUCCESS; > > - fm_evt = m_MMGR_ALLOC_FM_EVT; > > - if (NULL == fm_evt) { > > - syslog(LOG_INFO, "fm_mds_rcv_evt: fm_evt allocation > FAILED."); > > - return; > > + TRACE_ENTER2("SVC ID: %d", (int) svc_id); > > + switch (svc_id) { > > + case NCSMDS_SVC_ID_IMMND: > > + cb->immnd_down = true; > > + LOG_NO("IMMND down on: %x", cb- > >peer_node_id); > > + break; > > + case NCSMDS_SVC_ID_AVND: > > + cb->amfnd_down = true; > > + LOG_NO("AMFND down on: %x", cb->peer_node_id); > > + break; > > + case NCSMDS_SVC_ID_IMMD: > > + cb->immd_down = true; > > + LOG_NO("IMMD down on: %x", cb->peer_node_id); > > + break; > > + case NCSMDS_SVC_ID_AVD: > > + cb->amfd_down = true; > > + LOG_NO("AVD down on: %x", cb->peer_node_id); > > + break; > > + case NCSMDS_SVC_ID_GFM: > > + cb->fm_down = true; > > + LOG_NO("FM down on: %x", cb->peer_node_id); > > + break; > > + default: > > + break; > > } > > - fm_evt->svc_id = svc_id; > > - rc = fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, node_id, > FM_EVT_SVC_DOWN); > > - if (rc == NCSCC_RC_FAILURE) { > > - m_MMGR_FREE_FM_EVT(fm_evt); > > - LOG_IN("service down event post to mailbox failed"); > > - fm_evt = NULL; > > + > > + /* Processing only for 
alternate node. > > + * Service downs of AMFND, IMMD, IMMND is the same as > NODE_DOWN from 4.4 onwards. > > + * This is required to handle the usecase involving > > + * '/etc/init.d/opensafd stop' without an OS reboot cycle > > + * Process service downs only if OpenSAF is not controlling TIPC. > > + * If OpenSAF is controlling TIPC, just wait for NODE_DOWN to > trigger failover. > > + */ > > + if (cb->immd_down && cb->immnd_down && cb->amfnd_down && > cb->amfd_down && cb->fm_down) { > > + LOG_NO("Core services went down on node_id: %x", > node_id); > > + > > + if(!cb->control_tipc) > > + fm_send_node_down_to_mbx(cb, node_id); > > } > > - return; > > + > > + TRACE_LEAVE(); > > } > > > > + > > static void check_for_node_isolation(FM_CB *cb) > > { > > bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3; > @@ > > -393,8 +422,7 @@ static uint32_t fm_mds_node_evt(FM_CB *c > > > ************************************************************** > ***************/ > > static uint32_t fm_mds_svc_evt(FM_CB *cb, > MDS_CALLBACK_SVC_EVENT_INFO *svc_evt) > > { > > - uint32_t return_val = NCSCC_RC_SUCCESS; > > - FM_EVT *fm_evt; > > + FM_EVT *fm_evt = NULL; > > TRACE_ENTER(); > > > > if (NULL == svc_evt) { > > @@ -413,43 +441,29 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb > > cb->peer_sc_up = false; > > check_for_node_isolation(cb); > > cb->peer_adest = 0; > > - if (!cb->control_tipc) { > > - > fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, svc_evt- > >i_svc_id); > > - } > > + > > + fm_proc_svc_down(cb, svc_evt- > >i_node_id, svc_evt->i_svc_id); > > } > > break; > > case NCSMDS_SVC_ID_IMMND: > > - if (svc_evt->i_node_id == cb->peer_node_id > > - && !cb- > >control_tipc) { > > - fm_send_svc_down_to_mbx(cb, > svc_evt->i_node_id, svc_evt->i_svc_id); > > - } > > - break; > > case NCSMDS_SVC_ID_AVND: > > - if (svc_evt->i_node_id == cb->peer_node_id > > - && !cb- > >control_tipc) { > > - fm_send_svc_down_to_mbx(cb, > svc_evt->i_node_id, svc_evt->i_svc_id); > > + if 
(svc_evt->i_node_id == cb->peer_node_id) > { > > + fm_proc_svc_down(cb, svc_evt- > >i_node_id, svc_evt->i_svc_id); > > } > > break; > > default: > > TRACE("Not interested in service down of > other services"); > > break; > > } > > - > > break; > > > > case NCSMDS_RED_DOWN: > > switch (svc_evt->i_svc_id) { > > /* Depend on service downs if OpenSAF is not > controling TIPC */ > > case NCSMDS_SVC_ID_IMMD: > > - if (svc_evt->i_node_id == cb->peer_node_id > > - && !cb- > >control_tipc) { > > - fm_send_svc_down_to_mbx(cb, > svc_evt->i_node_id, svc_evt->i_svc_id); > > - } > > - break; > > case NCSMDS_SVC_ID_AVD: > > - if (svc_evt->i_node_id == cb->peer_node_id > > - && !cb- > >control_tipc) { > > - fm_send_svc_down_to_mbx(cb, > svc_evt->i_node_id, svc_evt->i_svc_id); > > + if (svc_evt->i_node_id == cb->peer_node_id) > { > > + fm_proc_svc_down(cb, svc_evt- > >i_node_id, svc_evt->i_svc_id); > > } > > break; > > default: > > @@ -465,43 +479,108 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb > > TRACE("Peer fm status change: %d -> %d, > peer node id is: %x, cluster size is %llu", > > (int) cb->peer_sc_up, 1, svc_evt- > >i_node_id, (unsigned long long) cb->cluster_size); > > cb->peer_sc_up = true; > > + cb->fm_down = false; > > check_for_node_isolation(cb); > > > > fm_evt = m_MMGR_ALLOC_FM_EVT; > > - if (NULL == fm_evt) { > > - syslog(LOG_INFO, "fm_mds_svc_evt: > fm_evt allocation FAILED."); > > - return NCSCC_RC_FAILURE; > > - } > > + if (NULL == fm_evt) { > > + syslog(LOG_INFO, "fm_mds_svc_evt: > fm_evt allocation FAILED."); > > + return NCSCC_RC_FAILURE; > > + } > > + > > cb->peer_adest = svc_evt->i_dest; > > cb->peer_node_id = svc_evt->i_node_id; > > cb->peer_node_terminated = false; > > - return_val = > fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, cb->peer_node_id, > FM_EVT_PEER_UP); > > > > - if (NCSCC_RC_FAILURE == return_val) { > > - m_MMGR_FREE_FM_EVT(fm_evt); > > - fm_evt = NULL; > > - } > > + if(fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, > cb->peer_node_id, FM_EVT_PEER_UP) == 
NCSCC_RC_FAILURE) > > + { > > + m_MMGR_FREE_FM_EVT(fm_evt); > > + fm_evt = NULL; > > + } > > } > > break; > > + > > case NCSMDS_SVC_ID_IMMND: > > - if (svc_evt->i_node_id == cb->peer_node_id > > - && !cb- > >control_tipc) > > - cb->immnd_down = false; /* Only > IMMND is restartable */ > > + if (svc_evt->i_node_id == cb->peer_node_id){ > > + TRACE("Peer immnd status change: %d -> > %d, peer node id is: %x, cluster size is %llu", > > + (int) cb->peer_sc_up, 1, svc_evt- > >i_node_id, (unsigned long long) cb->cluster_size); > > + cb->immnd_down = false; > > + } > > + break; > > + > > + case NCSMDS_SVC_ID_AVND: > > + if (svc_evt->i_node_id == cb->peer_node_id){ > > + TRACE("Peer amfnd status change: %d -> %d, > peer node id is: %x, cluster size is %llu", > > + (int) cb->peer_sc_up, 1, svc_evt- > >i_node_id, (unsigned long long) cb->cluster_size); > > + cb->amfnd_down = false; > > + } > > break; > > default: > > break; > > } > > break; > > > > + case NCSMDS_RED_UP: > > + switch (svc_evt->i_svc_id) { > > + /* Depend on service downs if OpenSAF is not controling > TIPC */ > > + case NCSMDS_SVC_ID_IMMD: > > + if (svc_evt->i_node_id != cb->node_id) { > > + TRACE("Peer immd status change: %d -> %d, > peer node id is: %x, cluster size is %llu", > > + (int) cb->peer_sc_up, 1, svc_evt- > >i_node_id, (unsigned long long) cb->cluster_size); > > + cb->peer_node_id = svc_evt->i_node_id; > > + cb->immd_down = false; > > + > > + fm_evt = m_MMGR_ALLOC_FM_EVT; > > + if (NULL == fm_evt) { > > + syslog(LOG_INFO, "fm_mds_svc_evt: > fm_evt allocation FAILED."); > > + return NCSCC_RC_FAILURE; > > + } > > + > > + if(fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, > cb->peer_node_id, FM_EVT_PEER_UP) == NCSCC_RC_FAILURE) > > + { > > + m_MMGR_FREE_FM_EVT(fm_evt); > > + fm_evt = NULL; > > + } > > + } > > + break; > > + > > + case NCSMDS_SVC_ID_AVD: > > + if (svc_evt->i_node_id != cb->node_id) { > > + TRACE("Peer amfd status change: %d -> %d, > peer node id is: %x, cluster size is %llu", > > + (int) 
cb->peer_sc_up, 1, svc_evt- > >i_node_id, (unsigned long long) cb->cluster_size); > > + cb->peer_node_id = svc_evt->i_node_id; > > + cb->amfd_down = false; > > + > > + fm_evt = m_MMGR_ALLOC_FM_EVT; > > + if (NULL == fm_evt) { > > + syslog(LOG_INFO, "fm_mds_svc_evt: > fm_evt allocation FAILED."); > > + return NCSCC_RC_FAILURE; > > + } > > + > > + if(fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, > cb->peer_node_id, FM_EVT_PEER_UP) == NCSCC_RC_FAILURE) > > + { > > + m_MMGR_FREE_FM_EVT(fm_evt); > > + fm_evt = NULL; > > + } > > + } > > + break; > > + > > + default: > > + TRACE("Not interested in service down of other > services"); > > + break; > > + } > > + break; > > + > > default: > > syslog(LOG_INFO, "Wrong MDS event"); > > break; > > } > > > > TRACE_LEAVE(); > > - return return_val; > > + return NCSCC_RC_SUCCESS; > > } > > > > + > > > /************************************************************* > ************** > > * Name : fm_mds_rcv_evt > > * > > > > ---------------------------------------------------------------------- > > -------- Check out the vibrant tech community on one of the world's > > most engaging tech sites, SlashDot.org! http://sdm.link/slashdot > > _______________________________________________ > > Opensaf-devel mailing list > > [email protected] > > https://lists.sourceforge.net/lists/listinfo/opensaf-devel > > ------------------------------------------------------------------------------ > Check out the vibrant tech community on one of the world's most engaging > tech sites, SlashDot.org! http://sdm.link/slashdot > _______________________________________________ > Opensaf-devel mailing list > [email protected] > https://lists.sourceforge.net/lists/listinfo/opensaf-devel ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! 
http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
