Ack for this patch, with comments marked "AndersW>".

regards,

Anders Widell


On 01/23/2018 09:06 AM, Gary Lee wrote:
* consult with consensus service before promoting node to active
* add watch thread and self-fence if it detects active controller
   has been changed (if remote fencing is disabled)
---
  src/rde/Makefile.am       |  3 ++-
  src/rde/rded/osaf-rded.in |  4 ++++
  src/rde/rded/rde_cb.h     |  4 +++-
  src/rde/rded/rde_main.cc  | 38 +++++++++++++++++++++++++++++++++-----
  src/rde/rded/role.cc      | 45 ++++++++++++++++++++++++++++++++++++++++++++-
  src/rde/rded/role.h       |  3 +++
  6 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/src/rde/Makefile.am b/src/rde/Makefile.am
index c967f9fc4..182f347ab 100644
--- a/src/rde/Makefile.am
+++ b/src/rde/Makefile.am
@@ -58,7 +58,8 @@ bin_osafrded_SOURCES = \
bin_osafrded_LDADD = \
        lib/libSaAmf.la \
-       lib/libopensaf_core.la
+       lib/libopensaf_core.la \
+       lib/libosaf_common.la
bin_rdegetrole_CPPFLAGS = \
        $(AM_CPPFLAGS)
diff --git a/src/rde/rded/osaf-rded.in b/src/rde/rded/osaf-rded.in
index 1c1786c8d..1697936a7 100644
--- a/src/rde/rded/osaf-rded.in
+++ b/src/rde/rded/osaf-rded.in
@@ -28,6 +28,10 @@ else
        . $pkgsysconfdir/rde.conf
  fi    
+if [ -f "$pkgsysconfdir/fmd.conf" ]; then
+  . "$pkgsysconfdir/fmd.conf"
+fi
+
  binary=$pkglibdir/$osafprog
  pidfile=$pkgpiddir/$osafprog.pid
  tracefile=$pkglogdir/$osafprog.log
diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
index d2a3d46b2..fc100849a 100644
--- a/src/rde/rded/rde_cb.h
+++ b/src/rde/rded/rde_cb.h
@@ -39,13 +39,15 @@ struct RDE_CONTROL_BLOCK {
    bool task_terminate;
    RDE_RDA_CB rde_rda_cb;
    RDE_AMF_CB rde_amf_cb;
+  bool monitor_lock_thread_running;
  };
enum RDE_MSG_TYPE {
    RDE_MSG_PEER_UP = 1,
    RDE_MSG_PEER_DOWN = 2,
    RDE_MSG_PEER_INFO_REQ = 3,
-  RDE_MSG_PEER_INFO_RESP = 4
+  RDE_MSG_PEER_INFO_RESP = 4,
+  RDE_MSG_NEW_ACTIVE_CALLBACK = 5
  };
struct rde_peer_info {
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index 0298bf3ff..082c1c040 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -28,6 +28,7 @@
  #include <cerrno>
  #include <cstdlib>
  #include <cstring>
+#include "osaf/consensus/service.h"
  #include "base/daemon.h"
  #include "base/logtrace.h"
  #include "base/osaf_poll.h"
@@ -37,6 +38,7 @@
  #include <saAmf.h>
  #include "rde/rded/rde_cb.h"
  #include "rde/rded/role.h"
+#include "base/conf.h"
AndersW> Sort project include files alphabetically.
#define RDA_MAX_CLIENTS 32
@@ -92,10 +94,6 @@ static void handle_mbx_event() {
    TRACE_ENTER();
msg = reinterpret_cast<rde_msg *>(ncs_ipc_non_blk_recv(&rde_cb->mbx));
-  TRACE("Received %s from node 0x%x with state %s. My state is %s",
-        rde_msg_name[msg->type], msg->fr_node_id,
-        Role::to_string(msg->info.peer_info.ha_role),
-        Role::to_string(role->role()));
switch (msg->type) {
      case RDE_MSG_PEER_INFO_REQ:
@@ -118,6 +116,34 @@ static void handle_mbx_event() {
      case RDE_MSG_PEER_DOWN:
        LOG_NO("Peer down on node 0x%x", msg->fr_node_id);
        break;
+   case RDE_MSG_NEW_ACTIVE_CALLBACK:
+      {
+        const std::string my_node = base::Conf::NodeName();
+        rde_cb->monitor_lock_thread_running = false;
+
+        // get current active controller
+        Consensus consensus_service;
AndersW> Shouldn't the Consensus instance be created once, instead of creating a new instance each time you receive this callback? The Consensus constructor even logs to syslog (at INFO level).
+        std::string active_controller = consensus_service.CurrentActive();
+
+        LOG_NO("New active controller notification from consensus service");
+
+        if (role->role() == PCS_RDA_ACTIVE) {
+          if (my_node.compare(active_controller) != 0) {
+            // we are meant to be active, but consensus service doesn't think so
+            LOG_WA("Role does not match consensus service. New controller: %s",
+              active_controller.c_str());
+            if (consensus_service.IsRemoteFencingEnabled() == false ) {
+              LOG_ER("Probable split-brain. Rebooting this node");
+              opensaf_reboot(0, nullptr, "Split-brain detected by consensus service");
+            }
+          }
+
+          // register for callback
+          rde_cb->monitor_lock_thread_running = true;
+          consensus_service.MonitorLock(Role::MonitorCallback, rde_cb->mbx);
+        }
+      }
+      break;
      default:
        LOG_ER("%s: discarding unknown message type %u", __FUNCTION__, msg->type);
        break;
@@ -192,6 +218,7 @@ static int initialize_rde() {
      goto init_failed;
    }
+ rde_cb->monitor_lock_thread_running = false;
    rc = NCSCC_RC_SUCCESS;
init_failed:
@@ -205,11 +232,12 @@ int main(int argc, char *argv[]) {
    NCS_SEL_OBJ mbx_sel_obj;
    RDE_RDA_CB *rde_rda_cb = &rde_cb->rde_rda_cb;
    int term_fd;
-
    opensaf_reboot_prepare();
daemonize(argc, argv);
+ base::Conf::InitNodeName();
+
    if (initialize_rde() != NCSCC_RC_SUCCESS) goto init_failed;
mbx_sel_obj = ncs_ipc_get_sel_obj(&rde_cb->mbx);
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index f7511f0d8..c821aeb33 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -27,7 +27,9 @@
  #include "base/process.h"
  #include "base/time.h"
  #include "base/ncs_main_papi.h"
+#include "base/ncssysf_def.h"
  #include "rde/rded/rde_cb.h"
+#include "osaf/consensus/service.h"
AndersW> Sort project include files alphabetically.
const char* const Role::role_names_[] = {"Undefined", "ACTIVE", "STANDBY",
                                           "QUIESCED",  "QUIESCING", "Invalid"};
@@ -42,6 +44,20 @@ const char* Role::to_string(PCS_RDA_ROLE role) {
               : role_names_[0];
  }
+void Role::MonitorCallback(const std::string& key,
+  const std::string& new_value, SYSF_MBX mbx)
+{
+  TRACE_ENTER();
+
+  rde_msg* msg = static_cast<rde_msg *>(malloc(sizeof(rde_msg)));
+  msg->type = RDE_MSG_NEW_ACTIVE_CALLBACK;
+
+  uint32_t status;
+  status = m_NCS_IPC_SEND(&mbx,
+    msg, NCS_IPC_PRIORITY_NORMAL);
+  osafassert(status == NCSCC_RC_SUCCESS);
+}
+
  Role::Role(NODE_ID own_node_id)
      : known_nodes_{},
        role_{PCS_RDA_QUIESCED},
@@ -61,10 +77,26 @@ timespec* Role::Poll(timespec* ts) {
        *ts = election_end_time_ - now;
        timeout = ts;
      } else {
+      SaAisErrorT rc;
+      Consensus consensus_service;
+      rc = consensus_service.PromoteThisNode();
+      if (rc != SA_AIS_OK) {
+        LOG_ER("Unable to set active controller in consensus service");
+        opensaf_reboot(0, nullptr, "Unable to set active controller in consensus service");
+      }
+
        ExecutePreActiveScript();
        LOG_NO("Switched to ACTIVE from %s", to_string(role()));
        role_ = PCS_RDA_ACTIVE;
        rde_rda_send_role(role_);
+
+      // register for callback if active controller is changed
+      // in consensus service
+      RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+      if (cb->monitor_lock_thread_running == false) {
+        cb->monitor_lock_thread_running = true;
+        consensus_service.MonitorLock(MonitorCallback, cb->mbx);
+      }
      }
    }
    return timeout;
@@ -91,7 +123,18 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
    }
    if (new_role != old_role) {
      LOG_NO("RDE role set to %s", to_string(new_role));
-    if (new_role == PCS_RDA_ACTIVE) ExecutePreActiveScript();
+    if (new_role == PCS_RDA_ACTIVE) {
+      ExecutePreActiveScript();
+
+      // register for callback if active controller is changed
+      // in consensus service
+      Consensus consensus_service;
+      RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+      if (cb->monitor_lock_thread_running == false) {
+        cb->monitor_lock_thread_running = true;
+        consensus_service.MonitorLock(MonitorCallback, cb->mbx);
+      }
+    }
      role_ = new_role;
      if (new_role == PCS_RDA_UNDEFINED) {
        known_nodes_.clear();
diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
index 20219b535..bee983828 100644
--- a/src/rde/rded/role.h
+++ b/src/rde/rded/role.h
@@ -21,6 +21,7 @@
  #include <time.h>
  #include <cstdint>
  #include <set>
+#include <string>
  #include "base/macros.h"
  #include "mds/mds_papi.h"
  #include "rde/agent/rda_papi.h"
@@ -38,6 +39,8 @@ class Role {
    uint32_t SetRole(PCS_RDA_ROLE new_role);
    PCS_RDA_ROLE role() const;
    static const char* to_string(PCS_RDA_ROLE role);
+  static void MonitorCallback(const std::string& key,
+    const std::string& new_value, SYSF_MBX mbx);
private:
    static const uint64_t kDefaultDiscoverPeerTimeout = 2000;


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to