--- src/fm/fmd/fm_cb.h | 2 +- src/fm/fmd/fm_main.cc | 26 +++++--------------------- src/fm/fmd/fm_mds.cc | 2 ++ src/fm/fmd/fm_rda.cc | 27 ++++++++++++++++++++++----- 4 files changed, 30 insertions(+), 27 deletions(-)
diff --git a/src/fm/fmd/fm_cb.h b/src/fm/fmd/fm_cb.h index cfa50d883..010ab735a 100644 --- a/src/fm/fmd/fm_cb.h +++ b/src/fm/fmd/fm_cb.h @@ -100,7 +100,7 @@ struct FM_CB { std::atomic<bool> peer_sc_up{false}; bool well_connected{false}; - uint64_t cluster_size{}; + std::atomic<uint64_t> cluster_size{}; struct timespec last_well_connected{}; struct timespec node_isolation_timeout{}; SaClmHandleT clm_hdl{}; diff --git a/src/fm/fmd/fm_main.cc b/src/fm/fmd/fm_main.cc index 73c9b9ccd..3371ec5e8 100644 --- a/src/fm/fmd/fm_main.cc +++ b/src/fm/fmd/fm_main.cc @@ -551,21 +551,12 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT *fm_mbx_evt) { * trigerred quicker than the node_down event * has been received. */ - if (fm_cb->role == PCS_RDA_STANDBY) { - const std::string current_active = - consensus_service.CurrentActive(); - if (current_active.compare(osaf_extended_name_borrow( - &fm_cb->peer_clm_node_name)) == 0) { - // update consensus service, before fencing old active controller - consensus_service.DemoteCurrentActive(); - } - } if (fm_cb->use_remote_fencing) { if (fm_cb->peer_node_terminated == false) { // if peer_sc_up is true then // the node has come up already - if (fm_cb->peer_sc_up == false && fm_cb->immnd_down == true) { + if (consensus_service.IsEnabled() == false) { opensaf_reboot(fm_cb->peer_node_id, (char *)fm_cb->peer_clm_node_name.value, "Received Node Down for peer controller"); @@ -580,8 +571,7 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT *fm_mbx_evt) { fm_cb->mutex_.Lock(); peer_node_name = fm_cb->peer_node_name; fm_cb->mutex_.Unlock(); - opensaf_reboot(fm_cb->peer_node_id, - peer_node_name.c_str(), + opensaf_reboot(fm_cb->peer_node_id, peer_node_name.c_str(), "Received Node Down for peer controller"); } if (!((fm_cb->role == PCS_RDA_ACTIVE) && @@ -632,12 +622,6 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT *fm_mbx_evt) { } Consensus consensus_service; - const std::string current_active = consensus_service.CurrentActive(); - if (current_active.compare( - osaf_extended_name_borrow(&fm_cb->peer_clm_node_name)) == 0) { - // update consensus service, before fencing old active controller - consensus_service.DemoteCurrentActive(); - } /* Now. Try resetting other blade */ fm_cb->role = PCS_RDA_ACTIVE; @@ -645,7 +629,8 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT *fm_mbx_evt) { LOG_NO("Reseting peer controller node id: %x", unsigned(fm_cb->peer_node_id)); if (fm_cb->use_remote_fencing) { - if (fm_cb->peer_node_terminated == false) { + if (fm_cb->peer_node_terminated == false && + consensus_service.IsEnabled() == false) { opensaf_reboot(fm_cb->peer_node_id, (char *)fm_cb->peer_clm_node_name.value, "Received Node Down for peer controller"); @@ -658,8 +643,7 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT *fm_mbx_evt) { fm_cb->mutex_.Lock(); peer_node_name = fm_cb->peer_node_name; fm_cb->mutex_.Unlock(); - opensaf_reboot(fm_cb->peer_node_id, - peer_node_name.c_str(), + opensaf_reboot(fm_cb->peer_node_id, peer_node_name.c_str(), "Received Node Down for Active peer"); } fm_rda_set_role(fm_cb, PCS_RDA_ACTIVE); diff --git a/src/fm/fmd/fm_mds.cc b/src/fm/fmd/fm_mds.cc index 277a357d2..60db5dab1 100644 --- a/src/fm/fmd/fm_mds.cc +++ b/src/fm/fmd/fm_mds.cc @@ -373,6 +373,7 @@ static uint32_t fm_mds_node_evt(FM_CB *cb, case NCSMDS_NODE_DOWN: if (cb->cluster_size != 0) { --cb->cluster_size; + TRACE("cluster_size %llu", (unsigned long long)cb->cluster_size); TRACE("Node down event for node id %x, cluster size is now: %llu", node_evt->node_id, (unsigned long long)cb->cluster_size); check_for_node_isolation(cb); @@ -397,6 +398,7 @@ static uint32_t fm_mds_node_evt(FM_CB *cb, case NCSMDS_NODE_UP: ++cb->cluster_size; + TRACE("cluster_size %llu", (unsigned long long)cb->cluster_size); TRACE("Node up event for node id %x, cluster size is now: %llu", node_evt->node_id, (unsigned long long)cb->cluster_size); check_for_node_isolation(cb); diff --git a/src/fm/fmd/fm_rda.cc b/src/fm/fmd/fm_rda.cc index 47e1f1d32..af337f868 100644 --- a/src/fm/fmd/fm_rda.cc +++ b/src/fm/fmd/fm_rda.cc @@ -87,11 +87,28 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb, PCS_RDA_ROLE role) { osafassert(role == PCS_RDA_ACTIVE); Consensus consensus_service; - rc = consensus_service.PromoteThisNode(); - if (rc != SA_AIS_OK) { - LOG_ER("Unable to set active controller in consensus service"); - opensaf_reboot(0, nullptr, - "Unable to set active controller in consensus service"); + if (consensus_service.IsEnabled() == true) { + // Allow topology events to be processed first. The MDS thread may + // be processing MDS down events and updating cluster_size concurrently. + // We need cluster_size to be as accurate as possible, without waiting + // too long for node down events. + std::this_thread::sleep_for(std::chrono::seconds(3)); + + rc = consensus_service.PromoteThisNode(true, fm_cb->cluster_size); + if (rc != SA_AIS_OK && rc != SA_AIS_ERR_EXIST) { + LOG_ER("Unable to set active controller in consensus service"); + opensaf_reboot(0, nullptr, + "Unable to set active controller in consensus service"); + } else if (rc == SA_AIS_ERR_EXIST) { + // @todo if we don't reboot, we don't seem to recover from this. Can we + // improve? + LOG_ER( + "A controller is already active. We were separated from the " + "cluster?"); + opensaf_reboot(0, nullptr, + "A controller is already active. We were separated " + "from the cluster?"); + } } rc = pcs_rda_request(&rda_req); -- 2.14.1 ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel