Hi,
Correction to the patch.
+ /* Weird situation in a cluster, where the new-Active
controller node finds that the peer node
+ * (old-Active) is still in the process of shutting down (i.e.,
amfd/immd is still alive).
+ */
+ if ((fm_cb->role == PCS_RDA_ACTIVE) && (fm_cb->csi_assigned ==
false)) {
+ LOG_ER("Two active controllers observed in a cluster,
newActive: %x and old-Active: %x", fm_cb->node_id, fm_cb->peer_node_id);
+ opensaf_reboot(fm_cb->peer_node_id, NULL,
Correction -- the line above should instead read: opensaf_reboot(0, NULL,
+ "Received svc up from peer node (old-active is not
fully DOWN), hence rebooting the new Active");
+ }
Thanks,
Ramesh.
On 3/2/2017 2:02 PM, [email protected] wrote:
> src/fm/fmd/fm_evt.h | 2 +-
> src/fm/fmd/fm_main.c | 78 ++++++---------------
> src/fm/fmd/fm_mds.c | 181
> ++++++++++++++++++++++++++++++++++++--------------
> 3 files changed, 155 insertions(+), 106 deletions(-)
>
>
> diff --git a/src/fm/fmd/fm_evt.h b/src/fm/fmd/fm_evt.h
> --- a/src/fm/fmd/fm_evt.h
> +++ b/src/fm/fmd/fm_evt.h
> @@ -1,6 +1,7 @@
> /* -*- OpenSAF -*-
> *
> * (C) Copyright 2008 The OpenSAF Foundation
> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
> *
> * This program is distributed in the hope that it will be useful, but
> * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
> @@ -49,7 +50,6 @@ typedef enum {
> FM_EVT_NODE_DOWN,
> FM_EVT_PEER_UP,
> FM_EVT_RDA_ROLE,
> - FM_EVT_SVC_DOWN,
> FM_FSM_EVT_MAX
> } FM_FSM_EVT_CODE;
>
> diff --git a/src/fm/fmd/fm_main.c b/src/fm/fmd/fm_main.c
> --- a/src/fm/fmd/fm_main.c
> +++ b/src/fm/fmd/fm_main.c
> @@ -1,6 +1,7 @@
> /* -*- OpenSAF -*-
> *
> * (C) Copyright 2008 The OpenSAF Foundation
> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
> *
> * This program is distributed in the hope that it will be useful, but
> * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
> @@ -31,6 +32,7 @@ This file contains the main() routine fo
> #include "nid/agent/nid_api.h"
> #include "fm.h"
> #include "base/osaf_time.h"
> +#include "base/osaf_poll.h"
>
> #define FM_CLM_API_TIMEOUT 10000000000LL
>
> @@ -71,7 +73,6 @@ void handle_mbx_event(void);
> extern uint32_t fm_amf_init(FM_AMF_CB *fm_amf_cb);
> uint32_t gl_fm_hdl;
> static NCS_SEL_OBJ usr1_sel_obj;
> -void fm_proc_svc_down(FM_CB *cb, FM_EVT *fm_mbx_evt);
>
> /**
> * USR1 signal is used when AMF wants instantiate us as a
> @@ -176,6 +177,11 @@ int main(int argc, char *argv[])
> */
> fm_cb->control_tipc = true; /* Default behaviour */
>
> + fm_cb->immd_down = true;
> + fm_cb->immnd_down = true;
> + fm_cb->amfnd_down = true;
> + fm_cb->amfd_down = true;
> +
> /* Create CB handle */
> gl_fm_hdl = ncshm_create_hdl(NCS_HM_POOL_ID_COMMON, NCS_SERVICE_ID_GFM,
> (NCSCONTEXT)fm_cb);
>
> @@ -194,7 +200,7 @@ int main(int argc, char *argv[])
> goto fm_init_failed;
> }
>
> -/* Attach MBX */
> + /* Attach MBX */
> if (m_NCS_IPC_ATTACH(&fm_cb->mbx) != NCSCC_RC_SUCCESS) {
> syslog(LOG_ERR, "m_NCS_IPC_ATTACH() failed.");
> goto fm_init_failed;
> @@ -268,7 +274,7 @@ int main(int argc, char *argv[])
>
> /* notify the NID */
> if (nid_started)
> - fm_nid_notify(NCSCC_RC_SUCCESS);
> + fm_nid_notify((uint32_t) NCSCC_RC_SUCCESS);
>
> while (1) {
> ret = poll(fds, nfds, -1);
> @@ -454,52 +460,6 @@ static uint32_t fm_get_args(FM_CB *fm_cb
> return NCSCC_RC_SUCCESS;
> }
>
> -void fm_proc_svc_down(FM_CB *cb, FM_EVT *fm_mbx_evt)
> -{
> - switch (fm_mbx_evt->svc_id) {
> - case NCSMDS_SVC_ID_IMMND:
> - cb->immnd_down = true;
> - LOG_NO("IMMND down on: %x", cb->peer_node_id);
> - break;
> - case NCSMDS_SVC_ID_AVND:
> - cb->amfnd_down = true;
> - LOG_NO("AMFND down on: %x", cb->peer_node_id);
> - break;
> - case NCSMDS_SVC_ID_IMMD:
> - cb->immd_down = true;
> - LOG_NO("IMMD down on: %x", cb->peer_node_id);
> - break;
> - case NCSMDS_SVC_ID_AVD:
> - cb->amfd_down = true;
> - LOG_NO("AVD down on: %x", cb->peer_node_id);
> - break;
> - case NCSMDS_SVC_ID_GFM:
> - cb->fm_down = true;
> - LOG_NO("FM down on: %x", cb->peer_node_id);
> - break;
> - default:
> - break;
> - }
> -
> - /* Processing only for alternate node.
> - * Service downs of AMFND, IMMD, IMMND is the same as NODE_DOWN from 4.4
> onwards.
> - * This is required to handle the usecase involving
> - * '/etc/init.d/opensafd stop' without an OS reboot cycle
> - * Process service downs only if OpenSAF is not controlling TIPC.
> - * If OpenSAF is controlling TIPC, just wait for NODE_DOWN to trigger
> failover.
> - */
> - if (cb->immd_down && cb->immnd_down && cb->amfnd_down && cb->amfd_down
> && cb->fm_down) {
> - LOG_NO("Core services went down on node_id: %x",
> fm_mbx_evt->node_id);
> - fm_send_node_down_to_mbx(cb, fm_mbx_evt->node_id);
> - /* Reset peer downs, because we've made MDS RED subscriptions */
> - cb->immd_down = false;
> - cb->immnd_down = false;
> - cb->amfnd_down = false;
> - cb->amfd_down = false;
> - cb->fm_down = false;
> - }
> -}
> -
>
> /****************************************************************************
> * Name : fm_clm_init
> *
> @@ -642,11 +602,18 @@ static void fm_mbx_msg_handler(FM_CB *fm
> }
> }
> break;
> - case FM_EVT_SVC_DOWN:
> - fm_proc_svc_down(fm_cb, fm_mbx_evt);
> - break;
> +
> case FM_EVT_PEER_UP:
> -/* Peer fm came up so sending ee_id of this node */
> + /* Weird situation in a cluster, where the new-Active
> controller node finds that the peer node
> + * (old-Active) is still in the process of shutting down (i.e.,
> amfd/immd is still alive).
> + */
> + if ((fm_cb->role == PCS_RDA_ACTIVE) && (fm_cb->csi_assigned ==
> false)) {
> + LOG_ER("Two active controllers observed in a cluster,
> newActive: %x and old-Active: %x", fm_cb->node_id, fm_cb->peer_node_id);
> + opensaf_reboot(fm_cb->peer_node_id, NULL,
> + "Received svc up from peer node (old-active is not
> fully DOWN), hence rebooting the new Active");
> + }
> +
> + /* Peer fm came up so sending ee_id of this node */
> if (fm_cb->node_name.length != 0)
> fms_fms_exchange_node_info(fm_cb);
>
> @@ -654,8 +621,9 @@ static void fm_mbx_msg_handler(FM_CB *fm
> get_peer_clm_node_name(fm_mbx_evt->node_id);
> }
> break;
> +
> case FM_EVT_TMR_EXP:
> -/* Timer Expiry event posted */
> + /* Timer Expiry event posted */
> if (fm_mbx_evt->info.fm_tmr->type == FM_TMR_PROMOTE_ACTIVE) {
> /* Check whether node(AMF) initialization is done */
> if (fm_cb->csi_assigned == false) {
> @@ -684,9 +652,11 @@ static void fm_mbx_msg_handler(FM_CB *fm
> "within the time limit");
> }
> break;
> +
> case FM_EVT_RDA_ROLE:
> fm_evt_proc_rda_callback(fm_cb, fm_mbx_evt);
> break;
> +
> default:
> break;
> }
> diff --git a/src/fm/fmd/fm_mds.c b/src/fm/fmd/fm_mds.c
> --- a/src/fm/fmd/fm_mds.c
> +++ b/src/fm/fmd/fm_mds.c
> @@ -1,6 +1,7 @@
> /* -*- OpenSAF -*-
> *
> * (C) Copyright 2008 The OpenSAF Foundation
> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
> *
> * This program is distributed in the hope that it will be useful, but
> * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
> @@ -34,6 +35,7 @@ static void check_for_node_isolation(FM_
> static bool has_been_well_connected_recently(FM_CB *cb);
> static uint32_t fm_mds_node_evt(FM_CB *cb, MDS_CALLBACK_NODE_EVENT_INFO *
> node_evt);
> static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT *fm_evt,
> NODE_ID node_id, FM_FSM_EVT_CODE evt_code);
> +static void fm_proc_svc_down(FM_CB *cb, uint32_t node_id, NCSMDS_SVC_ID
> svc_id);
>
> uint32_t
> fm_mds_sync_send(FM_CB *fm_cb, NCSCONTEXT msg,
> @@ -62,7 +64,7 @@ uint32_t fm_mds_init(FM_CB *cb)
> {
> NCSMDS_INFO arg;
> MDS_SVC_ID svc_id[] = { NCSMDS_SVC_ID_GFM, NCSMDS_SVC_ID_AVND,
> NCSMDS_SVC_ID_IMMND };
> - MDS_SVC_ID immd_id[2] = { NCSMDS_SVC_ID_IMMD, NCSMDS_SVC_ID_AVD };
> + MDS_SVC_ID svc_red_id[2] = { NCSMDS_SVC_ID_IMMD, NCSMDS_SVC_ID_AVD };
>
> /* Get the MDS handles to be used. */
> if (fm_mds_get_adest_hdls(cb) != NCSCC_RC_SUCCESS) {
> @@ -111,7 +113,7 @@ uint32_t fm_mds_init(FM_CB *cb)
> arg.i_op = MDS_RED_SUBSCRIBE;
> arg.info.svc_subscribe.i_num_svcs = 2;
> arg.info.svc_subscribe.i_scope = NCSMDS_SCOPE_NONE;
> - arg.info.svc_subscribe.i_svc_ids = immd_id;
> + arg.info.svc_subscribe.i_svc_ids = svc_red_id;
> if (ncsmds_api(&arg) == NCSCC_RC_FAILURE) {
> syslog(LOG_ERR, "MDS_RED_SUBSCRIBE failed");
> arg.i_op = MDS_UNINSTALL;
> @@ -285,25 +287,52 @@ uint32_t fm_send_node_down_to_mbx(FM_CB
> return rc;
> }
>
> -static void fm_send_svc_down_to_mbx(FM_CB *cb, uint32_t node_id,
> NCSMDS_SVC_ID svc_id)
> +void fm_proc_svc_down(FM_CB *cb, uint32_t node_id, NCSMDS_SVC_ID svc_id)
> {
> - FM_EVT *fm_evt = NULL;
> - uint32_t rc = NCSCC_RC_SUCCESS;
> - fm_evt = m_MMGR_ALLOC_FM_EVT;
> - if (NULL == fm_evt) {
> - syslog(LOG_INFO, "fm_mds_rcv_evt: fm_evt allocation FAILED.");
> - return;
> + TRACE_ENTER2("SVC ID: %d", (int) svc_id);
> + switch (svc_id) {
> + case NCSMDS_SVC_ID_IMMND:
> + cb->immnd_down = true;
> + LOG_NO("IMMND down on: %x", cb->peer_node_id);
> + break;
> + case NCSMDS_SVC_ID_AVND:
> + cb->amfnd_down = true;
> + LOG_NO("AMFND down on: %x", cb->peer_node_id);
> + break;
> + case NCSMDS_SVC_ID_IMMD:
> + cb->immd_down = true;
> + LOG_NO("IMMD down on: %x", cb->peer_node_id);
> + break;
> + case NCSMDS_SVC_ID_AVD:
> + cb->amfd_down = true;
> + LOG_NO("AVD down on: %x", cb->peer_node_id);
> + break;
> + case NCSMDS_SVC_ID_GFM:
> + cb->fm_down = true;
> + LOG_NO("FM down on: %x", cb->peer_node_id);
> + break;
> + default:
> + break;
> }
> - fm_evt->svc_id = svc_id;
> - rc = fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, node_id, FM_EVT_SVC_DOWN);
> - if (rc == NCSCC_RC_FAILURE) {
> - m_MMGR_FREE_FM_EVT(fm_evt);
> - LOG_IN("service down event post to mailbox failed");
> - fm_evt = NULL;
> +
> + /* Processing only for alternate node.
> + * Service downs of AMFND, IMMD, IMMND is the same as NODE_DOWN from
> 4.4 onwards.
> + * This is required to handle the usecase involving
> + * '/etc/init.d/opensafd stop' without an OS reboot cycle
> + * Process service downs only if OpenSAF is not controlling TIPC.
> + * If OpenSAF is controlling TIPC, just wait for NODE_DOWN to trigger
> failover.
> + */
> + if (cb->immd_down && cb->immnd_down && cb->amfnd_down && cb->amfd_down
> && cb->fm_down) {
> + LOG_NO("Core services went down on node_id: %x", node_id);
> +
> + if(!cb->control_tipc)
> + fm_send_node_down_to_mbx(cb, node_id);
> }
> - return;
> +
> + TRACE_LEAVE();
> }
>
> +
> static void check_for_node_isolation(FM_CB *cb)
> {
> bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3;
> @@ -393,8 +422,7 @@ static uint32_t fm_mds_node_evt(FM_CB *c
>
> *****************************************************************************/
> static uint32_t fm_mds_svc_evt(FM_CB *cb, MDS_CALLBACK_SVC_EVENT_INFO
> *svc_evt)
> {
> - uint32_t return_val = NCSCC_RC_SUCCESS;
> - FM_EVT *fm_evt;
> + FM_EVT *fm_evt = NULL;
> TRACE_ENTER();
>
> if (NULL == svc_evt) {
> @@ -413,43 +441,29 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
> cb->peer_sc_up = false;
> check_for_node_isolation(cb);
> cb->peer_adest = 0;
> - if (!cb->control_tipc) {
> - fm_send_svc_down_to_mbx(cb,
> svc_evt->i_node_id, svc_evt->i_svc_id);
> - }
> +
> + fm_proc_svc_down(cb,
> svc_evt->i_node_id, svc_evt->i_svc_id);
> }
> break;
> case NCSMDS_SVC_ID_IMMND:
> - if (svc_evt->i_node_id == cb->peer_node_id
> - && !cb->control_tipc) {
> - fm_send_svc_down_to_mbx(cb,
> svc_evt->i_node_id, svc_evt->i_svc_id);
> - }
> - break;
> case NCSMDS_SVC_ID_AVND:
> - if (svc_evt->i_node_id == cb->peer_node_id
> - && !cb->control_tipc) {
> - fm_send_svc_down_to_mbx(cb,
> svc_evt->i_node_id, svc_evt->i_svc_id);
> + if (svc_evt->i_node_id == cb->peer_node_id) {
> + fm_proc_svc_down(cb,
> svc_evt->i_node_id, svc_evt->i_svc_id);
> }
> break;
> default:
> TRACE("Not interested in service down of other
> services");
> break;
> }
> -
> break;
>
> case NCSMDS_RED_DOWN:
> switch (svc_evt->i_svc_id) {
> /* Depend on service downs if OpenSAF is not controling
> TIPC */
> case NCSMDS_SVC_ID_IMMD:
> - if (svc_evt->i_node_id == cb->peer_node_id
> - && !cb->control_tipc) {
> - fm_send_svc_down_to_mbx(cb,
> svc_evt->i_node_id, svc_evt->i_svc_id);
> - }
> - break;
> case NCSMDS_SVC_ID_AVD:
> - if (svc_evt->i_node_id == cb->peer_node_id
> - && !cb->control_tipc) {
> - fm_send_svc_down_to_mbx(cb,
> svc_evt->i_node_id, svc_evt->i_svc_id);
> + if (svc_evt->i_node_id == cb->peer_node_id) {
> + fm_proc_svc_down(cb,
> svc_evt->i_node_id, svc_evt->i_svc_id);
> }
> break;
> default:
> @@ -465,43 +479,108 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
> TRACE("Peer fm status change: %d -> %d, peer
> node id is: %x, cluster size is %llu",
> (int) cb->peer_sc_up, 1,
> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
> cb->peer_sc_up = true;
> + cb->fm_down = false;
> check_for_node_isolation(cb);
>
> fm_evt = m_MMGR_ALLOC_FM_EVT;
> - if (NULL == fm_evt) {
> - syslog(LOG_INFO, "fm_mds_svc_evt:
> fm_evt allocation FAILED.");
> - return NCSCC_RC_FAILURE;
> - }
> + if (NULL == fm_evt) {
> + syslog(LOG_INFO, "fm_mds_svc_evt:
> fm_evt allocation FAILED.");
> + return NCSCC_RC_FAILURE;
> + }
> +
> cb->peer_adest = svc_evt->i_dest;
> cb->peer_node_id = svc_evt->i_node_id;
> cb->peer_node_terminated = false;
> - return_val = fm_fill_mds_evt_post_fm_mbx(cb,
> fm_evt, cb->peer_node_id, FM_EVT_PEER_UP);
>
> - if (NCSCC_RC_FAILURE == return_val) {
> - m_MMGR_FREE_FM_EVT(fm_evt);
> - fm_evt = NULL;
> - }
> + if(fm_fill_mds_evt_post_fm_mbx(cb, fm_evt,
> cb->peer_node_id, FM_EVT_PEER_UP) == NCSCC_RC_FAILURE)
> + {
> + m_MMGR_FREE_FM_EVT(fm_evt);
> + fm_evt = NULL;
> + }
> }
> break;
> +
> case NCSMDS_SVC_ID_IMMND:
> - if (svc_evt->i_node_id == cb->peer_node_id
> - && !cb->control_tipc)
> - cb->immnd_down = false; /* Only IMMND
> is restartable */
> + if (svc_evt->i_node_id == cb->peer_node_id){
> + TRACE("Peer immnd status change: %d -> %d, peer
> node id is: %x, cluster size is %llu",
> + (int) cb->peer_sc_up, 1,
> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
> + cb->immnd_down = false;
> + }
> + break;
> +
> + case NCSMDS_SVC_ID_AVND:
> + if (svc_evt->i_node_id == cb->peer_node_id){
> + TRACE("Peer amfnd status change: %d -> %d, peer
> node id is: %x, cluster size is %llu",
> + (int) cb->peer_sc_up, 1,
> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
> + cb->amfnd_down = false;
> + }
> break;
> default:
> break;
> }
> break;
>
> + case NCSMDS_RED_UP:
> + switch (svc_evt->i_svc_id) {
> + /* Depend on service downs if OpenSAF is not controlling TIPC */
> + case NCSMDS_SVC_ID_IMMD:
> + if (svc_evt->i_node_id != cb->node_id) {
> + TRACE("Peer immd status change: %d -> %d, peer
> node id is: %x, cluster size is %llu",
> + (int) cb->peer_sc_up, 1,
> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
> + cb->peer_node_id = svc_evt->i_node_id;
> + cb->immd_down = false;
> +
> + fm_evt = m_MMGR_ALLOC_FM_EVT;
> + if (NULL == fm_evt) {
> + syslog(LOG_INFO, "fm_mds_svc_evt:
> fm_evt allocation FAILED.");
> + return NCSCC_RC_FAILURE;
> + }
> +
> + if(fm_fill_mds_evt_post_fm_mbx(cb, fm_evt,
> cb->peer_node_id, FM_EVT_PEER_UP) == NCSCC_RC_FAILURE)
> + {
> + m_MMGR_FREE_FM_EVT(fm_evt);
> + fm_evt = NULL;
> + }
> + }
> + break;
> +
> + case NCSMDS_SVC_ID_AVD:
> + if (svc_evt->i_node_id != cb->node_id) {
> + TRACE("Peer amfd status change: %d -> %d, peer
> node id is: %x, cluster size is %llu",
> + (int) cb->peer_sc_up, 1,
> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
> + cb->peer_node_id = svc_evt->i_node_id;
> + cb->amfd_down = false;
> +
> + fm_evt = m_MMGR_ALLOC_FM_EVT;
> + if (NULL == fm_evt) {
> + syslog(LOG_INFO, "fm_mds_svc_evt:
> fm_evt allocation FAILED.");
> + return NCSCC_RC_FAILURE;
> + }
> +
> + if(fm_fill_mds_evt_post_fm_mbx(cb, fm_evt,
> cb->peer_node_id, FM_EVT_PEER_UP) == NCSCC_RC_FAILURE)
> + {
> + m_MMGR_FREE_FM_EVT(fm_evt);
> + fm_evt = NULL;
> + }
> + }
> + break;
> +
> + default:
> + TRACE("Not interested in service down of other
> services");
> + break;
> + }
> + break;
> +
> default:
> syslog(LOG_INFO, "Wrong MDS event");
> break;
> }
>
> TRACE_LEAVE();
> - return return_val;
> + return NCSCC_RC_SUCCESS;
> }
>
> +
> /***************************************************************************
> * Name : fm_mds_rcv_evt
> *
>
> ------------------------------------------------------------------------------
> Check out the vibrant tech community on one of the world's most
> engaging tech sites, SlashDot.org! http://sdm.link/slashdot
> _______________________________________________
> Opensaf-devel mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/opensaf-devel
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel