Ack for this patch, with comments marked "AndersW>".
regards,
Anders Widell
On 01/23/2018 09:06 AM, Gary Lee wrote:
* consult with consensus service before promoting node to active
* add watch thread and self-fence if it detects active controller
has been changed (if remote fencing is disabled)
---
src/rde/Makefile.am | 3 ++-
src/rde/rded/osaf-rded.in | 4
src/rde/rded/rde_cb.h | 4 +++-
src/rde/rded/rde_main.cc | 38 +-
src/rde/rded/role.cc | 45 -
src/rde/rded/role.h | 3 +++
6 files changed, 89 insertions(+), 8 deletions(-)
diff --git a/src/rde/Makefile.am b/src/rde/Makefile.am
index c967f9fc4..182f347ab 100644
--- a/src/rde/Makefile.am
+++ b/src/rde/Makefile.am
@@ -58,7 +58,8 @@ bin_osafrded_SOURCES = \
bin_osafrded_LDADD = \
lib/libSaAmf.la \
- lib/libopensaf_core.la
+ lib/libopensaf_core.la \
+ lib/libosaf_common.la
bin_rdegetrole_CPPFLAGS = \
$(AM_CPPFLAGS)
diff --git a/src/rde/rded/osaf-rded.in b/src/rde/rded/osaf-rded.in
index 1c1786c8d..1697936a7 100644
--- a/src/rde/rded/osaf-rded.in
+++ b/src/rde/rded/osaf-rded.in
@@ -28,6 +28,10 @@ else
. $pkgsysconfdir/rde.conf
fi
+if [ -f "$pkgsysconfdir/fmd.conf" ]; then
+ . "$pkgsysconfdir/fmd.conf"
+fi
+
binary=$pkglibdir/$osafprog
pidfile=$pkgpiddir/$osafprog.pid
tracefile=$pkglogdir/$osafprog.log
diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
index d2a3d46b2..fc100849a 100644
--- a/src/rde/rded/rde_cb.h
+++ b/src/rde/rded/rde_cb.h
@@ -39,13 +39,15 @@ struct RDE_CONTROL_BLOCK {
bool task_terminate;
RDE_RDA_CB rde_rda_cb;
RDE_AMF_CB rde_amf_cb;
+ bool monitor_lock_thread_running;
};
enum RDE_MSG_TYPE {
RDE_MSG_PEER_UP = 1,
RDE_MSG_PEER_DOWN = 2,
RDE_MSG_PEER_INFO_REQ = 3,
- RDE_MSG_PEER_INFO_RESP = 4
+ RDE_MSG_PEER_INFO_RESP = 4,
+ RDE_MSG_NEW_ACTIVE_CALLBACK = 5
};
struct rde_peer_info {
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index 0298bf3ff..082c1c040 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -28,6 +28,7 @@
#include
#include
#include
+#include "osaf/consensus/service.h"
#include "base/daemon.h"
#include "base/logtrace.h"
#include "base/osaf_poll.h"
@@ -37,6 +38,7 @@
#include
#include "rde/rded/rde_cb.h"
#include "rde/rded/role.h"
+#include "base/conf.h"
AndersW> Sort project include files alphabetically.
#define RDA_MAX_CLIENTS 32
@@ -92,10 +94,6 @@ static void handle_mbx_event() {
TRACE_ENTER();
msg = reinterpret_cast(ncs_ipc_non_blk_recv(_cb->mbx));
- TRACE("Received %s from node 0x%x with state %s. My state is %s",
-rde_msg_name[msg->type], msg->fr_node_id,
-Role::to_string(msg->info.peer_info.ha_role),
-Role::to_string(role->role()));
switch (msg->type) {
case RDE_MSG_PEER_INFO_REQ:
@@ -118,6 +116,34 @@ static void handle_mbx_event() {
case RDE_MSG_PEER_DOWN:
LOG_NO("Peer down on node 0x%x", msg->fr_node_id);
break;
+ case RDE_MSG_NEW_ACTIVE_CALLBACK:
+ {
+const std::string my_node = base::Conf::NodeName();
+rde_cb->monitor_lock_thread_running = false;
+
+// get current active controller
+Consensus consensus_service;
AndersW> Shouldn't the Consensus instance be created once, instead of
creating a new instance each time you receive this callback? The
Consensus constructor even logs to syslog (at INFO level).
+std::string active_controller = consensus_service.CurrentActive();
+
+LOG_NO("New active controller notification from consensus service");
+
+if (role->role() == PCS_RDA_ACTIVE) {
+ if (my_node.compare(active_controller) != 0) {
+// we are meant to be active, but consensus service doesn't think so
+LOG_WA("Role does not match consensus service. New controller: %s",
+ active_controller.c_str());
+if (consensus_service.IsRemoteFencingEnabled() == false ) {
+ LOG_ER("Probable split-brain. Rebooting this node");
+ opensaf_reboot(0, nullptr, "Split-brain detected by consensus service");
+}
+ }
+
+ // register for callback
+ rde_cb->monitor_lock_thread_running = true;
+ consensus_service.MonitorLock(Role::MonitorCallback, rde_cb->mbx);
+}
+ }
+ break;
default:
LOG_ER("%s: discarding unknown message type %u", __FUNCTION__,
msg->type);
break;
@@ -192,6 +218,7 @@ static int initialize_rde() {
goto init_failed;
}
+ rde_cb->monitor_lock_thread_running = false;
rc = NCSCC_RC_SUCCESS;
init_failed:
@@ -205,11 +232,12 @@ int main(int argc, char *argv[]) {
NCS_SEL_OBJ mbx_sel_obj;
RDE_RDA_CB *rde_rda_cb = &rde_cb->rde_rda_cb;
int term_fd;
-
opensaf_reboot_prepare();