* consult the consensus service before promoting this node to active * add a watch thread that self-fences this node if it detects that the active controller has changed (only when remote fencing is disabled) --- src/rde/Makefile.am | 3 ++- src/rde/rded/osaf-rded.in | 4 ++++ src/rde/rded/rde_cb.h | 4 +++- src/rde/rded/rde_main.cc | 38 +++++++++++++++++++++++++++++++++----- src/rde/rded/role.cc | 45 ++++++++++++++++++++++++++++++++++++++++++++- src/rde/rded/role.h | 3 +++ 6 files changed, 89 insertions(+), 8 deletions(-)
diff --git a/src/rde/Makefile.am b/src/rde/Makefile.am index c967f9fc4..182f347ab 100644 --- a/src/rde/Makefile.am +++ b/src/rde/Makefile.am @@ -58,7 +58,8 @@ bin_osafrded_SOURCES = \ bin_osafrded_LDADD = \ lib/libSaAmf.la \ - lib/libopensaf_core.la + lib/libopensaf_core.la \ + lib/libosaf_common.la bin_rdegetrole_CPPFLAGS = \ $(AM_CPPFLAGS) diff --git a/src/rde/rded/osaf-rded.in b/src/rde/rded/osaf-rded.in index 1c1786c8d..1697936a7 100644 --- a/src/rde/rded/osaf-rded.in +++ b/src/rde/rded/osaf-rded.in @@ -28,6 +28,10 @@ else . $pkgsysconfdir/rde.conf fi +if [ -f "$pkgsysconfdir/fmd.conf" ]; then + . "$pkgsysconfdir/fmd.conf" +fi + binary=$pkglibdir/$osafprog pidfile=$pkgpiddir/$osafprog.pid tracefile=$pkglogdir/$osafprog.log diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h index d2a3d46b2..fc100849a 100644 --- a/src/rde/rded/rde_cb.h +++ b/src/rde/rded/rde_cb.h @@ -39,13 +39,15 @@ struct RDE_CONTROL_BLOCK { bool task_terminate; RDE_RDA_CB rde_rda_cb; RDE_AMF_CB rde_amf_cb; + bool monitor_lock_thread_running; }; enum RDE_MSG_TYPE { RDE_MSG_PEER_UP = 1, RDE_MSG_PEER_DOWN = 2, RDE_MSG_PEER_INFO_REQ = 3, - RDE_MSG_PEER_INFO_RESP = 4 + RDE_MSG_PEER_INFO_RESP = 4, + RDE_MSG_NEW_ACTIVE_CALLBACK = 5 }; struct rde_peer_info { diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc index 0298bf3ff..082c1c040 100644 --- a/src/rde/rded/rde_main.cc +++ b/src/rde/rded/rde_main.cc @@ -28,6 +28,7 @@ #include <cerrno> #include <cstdlib> #include <cstring> +#include "osaf/consensus/service.h" #include "base/daemon.h" #include "base/logtrace.h" #include "base/osaf_poll.h" @@ -37,6 +38,7 @@ #include <saAmf.h> #include "rde/rded/rde_cb.h" #include "rde/rded/role.h" +#include "base/conf.h" #define RDA_MAX_CLIENTS 32 @@ -92,10 +94,6 @@ static void handle_mbx_event() { TRACE_ENTER(); msg = reinterpret_cast<rde_msg *>(ncs_ipc_non_blk_recv(&rde_cb->mbx)); - TRACE("Received %s from node 0x%x with state %s. 
My state is %s", - rde_msg_name[msg->type], msg->fr_node_id, - Role::to_string(msg->info.peer_info.ha_role), - Role::to_string(role->role())); switch (msg->type) { case RDE_MSG_PEER_INFO_REQ: @@ -118,6 +116,34 @@ static void handle_mbx_event() { case RDE_MSG_PEER_DOWN: LOG_NO("Peer down on node 0x%x", msg->fr_node_id); break; + case RDE_MSG_NEW_ACTIVE_CALLBACK: + { + const std::string my_node = base::Conf::NodeName(); + rde_cb->monitor_lock_thread_running = false; + + // get current active controller + Consensus consensus_service; + std::string active_controller = consensus_service.CurrentActive(); + + LOG_NO("New active controller notification from consensus service"); + + if (role->role() == PCS_RDA_ACTIVE) { + if (my_node.compare(active_controller) != 0) { + // we are meant to be active, but consensus service doesn't think so + LOG_WA("Role does not match consensus service. New controller: %s", + active_controller.c_str()); + if (consensus_service.IsRemoteFencingEnabled() == false ) { + LOG_ER("Probable split-brain. 
Rebooting this node"); + opensaf_reboot(0, nullptr, "Split-brain detected by consensus service"); + } + } + + // register for callback + rde_cb->monitor_lock_thread_running = true; + consensus_service.MonitorLock(Role::MonitorCallback, rde_cb->mbx); + } + } + break; default: LOG_ER("%s: discarding unknown message type %u", __FUNCTION__, msg->type); break; @@ -192,6 +218,7 @@ static int initialize_rde() { goto init_failed; } + rde_cb->monitor_lock_thread_running = false; rc = NCSCC_RC_SUCCESS; init_failed: @@ -205,11 +232,12 @@ int main(int argc, char *argv[]) { NCS_SEL_OBJ mbx_sel_obj; RDE_RDA_CB *rde_rda_cb = &rde_cb->rde_rda_cb; int term_fd; - opensaf_reboot_prepare(); daemonize(argc, argv); + base::Conf::InitNodeName(); + if (initialize_rde() != NCSCC_RC_SUCCESS) goto init_failed; mbx_sel_obj = ncs_ipc_get_sel_obj(&rde_cb->mbx); diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc index f7511f0d8..c821aeb33 100644 --- a/src/rde/rded/role.cc +++ b/src/rde/rded/role.cc @@ -27,7 +27,9 @@ #include "base/process.h" #include "base/time.h" #include "base/ncs_main_papi.h" +#include "base/ncssysf_def.h" #include "rde/rded/rde_cb.h" +#include "osaf/consensus/service.h" const char* const Role::role_names_[] = {"Undefined", "ACTIVE", "STANDBY", "QUIESCED", "QUIESCING", "Invalid"}; @@ -42,6 +44,20 @@ const char* Role::to_string(PCS_RDA_ROLE role) { : role_names_[0]; } +void Role::MonitorCallback(const std::string& key, + const std::string& new_value, SYSF_MBX mbx) +{ + TRACE_ENTER(); + + rde_msg* msg = static_cast<rde_msg *>(malloc(sizeof(rde_msg))); + msg->type = RDE_MSG_NEW_ACTIVE_CALLBACK; + + uint32_t status; + status = m_NCS_IPC_SEND(&mbx, + msg, NCS_IPC_PRIORITY_NORMAL); + osafassert(status == NCSCC_RC_SUCCESS); +} + Role::Role(NODE_ID own_node_id) : known_nodes_{}, role_{PCS_RDA_QUIESCED}, @@ -61,10 +77,26 @@ timespec* Role::Poll(timespec* ts) { *ts = election_end_time_ - now; timeout = ts; } else { + SaAisErrorT rc; + Consensus consensus_service; + rc = 
consensus_service.PromoteThisNode(); + if (rc != SA_AIS_OK) { + LOG_ER("Unable to set active controller in consensus service"); + opensaf_reboot(0, nullptr, "Unable to set active controller in consensus service"); + } + ExecutePreActiveScript(); LOG_NO("Switched to ACTIVE from %s", to_string(role())); role_ = PCS_RDA_ACTIVE; rde_rda_send_role(role_); + + // register for callback if active controller is changed + // in consensus service + RDE_CONTROL_BLOCK* cb = rde_get_control_block(); + if (cb->monitor_lock_thread_running == false) { + cb->monitor_lock_thread_running = true; + consensus_service.MonitorLock(MonitorCallback, cb->mbx); + } } } return timeout; @@ -91,7 +123,18 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) { } if (new_role != old_role) { LOG_NO("RDE role set to %s", to_string(new_role)); - if (new_role == PCS_RDA_ACTIVE) ExecutePreActiveScript(); + if (new_role == PCS_RDA_ACTIVE) { + ExecutePreActiveScript(); + + // register for callback if active controller is changed + // in consensus service + Consensus consensus_service; + RDE_CONTROL_BLOCK* cb = rde_get_control_block(); + if (cb->monitor_lock_thread_running == false) { + cb->monitor_lock_thread_running = true; + consensus_service.MonitorLock(MonitorCallback, cb->mbx); + } + } role_ = new_role; if (new_role == PCS_RDA_UNDEFINED) { known_nodes_.clear(); diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h index 20219b535..bee983828 100644 --- a/src/rde/rded/role.h +++ b/src/rde/rded/role.h @@ -21,6 +21,7 @@ #include <time.h> #include <cstdint> #include <set> +#include <string> #include "base/macros.h" #include "mds/mds_papi.h" #include "rde/agent/rda_papi.h" @@ -38,6 +39,8 @@ class Role { uint32_t SetRole(PCS_RDA_ROLE new_role); PCS_RDA_ROLE role() const; static const char* to_string(PCS_RDA_ROLE role); + static void MonitorCallback(const std::string& key, + const std::string& new_value, SYSF_MBX mbx); private: static const uint64_t kDefaultDiscoverPeerTimeout = 2000; -- 2.14.1 
------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel