---
00-README.conf | 43 ++++++
Makefile.am | 4 +-
src/amf/amfd/osaf-amfd.in | 4 +
src/amf/amfd/role.cc | 35 ++++-
src/fm/Makefile.am | 1 +
src/fm/fmd/fm_main.cc | 12 ++
src/fm/fmd/fm_rda.cc | 26 ++++
src/fm/fmd/fmd.conf | 8 ++
src/osaf/Makefile.am | 8 +-
src/osaf/consensus/Makefile | 18 +++
src/osaf/consensus/keyvalue.cc | 165 ++++++++++++++++++++++
src/osaf/consensus/keyvalue.h | 57 ++++++++
src/osaf/consensus/plugins/etcd.plugin | 217 +++++++++++++++++++++++++++++
src/osaf/consensus/plugins/sample.plugin | 162 ++++++++++++++++++++++
src/osaf/consensus/service.cc | 231 +++++++++++++++++++++++++++++++
src/osaf/consensus/service.h | 66 +++++++++
src/rde/Makefile.am | 3 +-
src/rde/rded/osaf-rded.in | 4 +
src/rde/rded/rde_cb.h | 3 +-
src/rde/rded/rde_main.cc | 32 ++++-
src/rde/rded/role.cc | 45 +++++-
src/rde/rded/role.h | 2 +
22 files changed, 1130 insertions(+), 16 deletions(-)
create mode 100644 src/osaf/consensus/Makefile
create mode 100644 src/osaf/consensus/keyvalue.cc
create mode 100644 src/osaf/consensus/keyvalue.h
create mode 100644 src/osaf/consensus/plugins/etcd.plugin
create mode 100644 src/osaf/consensus/plugins/sample.plugin
create mode 100644 src/osaf/consensus/service.cc
create mode 100644 src/osaf/consensus/service.h
diff --git a/00-README.conf b/00-README.conf
index 5de286225..24d583a89 100644
--- a/00-README.conf
+++ b/00-README.conf
@@ -654,3 +654,46 @@ on each node, except on the active node. This file
indicates that a cluster
reboot is in progress and all nodes needs to delay their start, this to give
the active a lead.
+Split-Brain Prevention with Consensus Service
+=============================================
+
+Split-brain prevention is supported in OpenSAF through a pluggable arbitration
+interface, to implement a consensus service. In network partitions containing
+half of the nodes or less, a controller cannot become active thus preventing
+a split-brain scenario. When the network once again merges such that consensus can
+be formed, an active controller will be chosen from the controllers that have write
+access to the consensus service. This is assuming that each node
+of the cluster participates in the consensus service.
+
+To enable split-brain prevention, edit fmd.conf and update accordingly:
+
+export SPLIT_BRAIN_PREVENTION=1
+export KEYVALUE_STORE_PLUGIN_CMD=/usr/local/lib/opensaf/etcd.plugin
+
+The plugin must implement a key-value store interface.
+
+An example plugin is provided for etcd, an implementation of RAFT. The plugin assumes
+etcd is installed and available on system controllers. In clusters where
+there are only two system controllers, it is highly recommended to configure etcd so
+it runs on at least three nodes to facilitate a majority vote with failure tolerance.
+
+Other implementations of a distributed key-value store service
+can be used, provided it implements the interface documented in sample.plugin.
+
+get <key> - returns <value> of <key> in key-value store
+set <key> <value> - set <key> to <value> in key-value store
+erase <key> - erase <key> from key-value store
+lock <owner> <timeout> - distributed lock held by <owner>, with automatic unlock after <timeout> seconds
+unlock [--force] - distributed unlock; with --force the lock is released even if held by another node
+lock_owner - returns owner of lock
+watch <key> - returns new <value> of <key> when the value is changed
+
+The key-value store does not need to reside on the same nodes as OpenSAF.
+In such a configuration, an appropriate plugin that handles
+the communication with a remotely located key-value store, must be provided.
+
+If remote fencing is enabled, then it will be used to fence a node that the consensus
+service believes should not be active. Otherwise, rded will initiate a 'self-fencing'
+by rebooting the node, if it determines the node is no longer active
+according to the consensus service.
+
diff --git a/Makefile.am b/Makefile.am
index bcfd844cd..57c2585a8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -159,7 +159,9 @@ dist_osaf_execbin_SCRIPTS += \
$(top_srcdir)/scripts/opensaf_reboot \
$(top_srcdir)/scripts/opensaf_sc_active \
$(top_srcdir)/scripts/opensaf_scale_out \
- $(top_srcdir)/scripts/plm_scale_out
+ $(top_srcdir)/scripts/plm_scale_out \
+ $(top_srcdir)/src/osaf/consensus/plugins/etcd.plugin
+# TODO remove above line before pushing
include $(top_srcdir)/src/ais/Makefile.am
include $(top_srcdir)/src/base/Makefile.am
diff --git a/src/amf/amfd/osaf-amfd.in b/src/amf/amfd/osaf-amfd.in
index 45c5ab9e4..26a77ef52 100644
--- a/src/amf/amfd/osaf-amfd.in
+++ b/src/amf/amfd/osaf-amfd.in
@@ -28,6 +28,10 @@ else
. $pkgsysconfdir/amfd.conf
fi
+if [ -f "$pkgsysconfdir/fmd.conf" ]; then
+ . "$pkgsysconfdir/fmd.conf"
+fi
+
binary=$pkglibdir/$osafprog
pidfile=$pkgpiddir/$osafprog.pid
lockfile=$lockdir/$initscript
diff --git a/src/amf/amfd/role.cc b/src/amf/amfd/role.cc
index 865d89d94..f98beea01 100644
--- a/src/amf/amfd/role.cc
+++ b/src/amf/amfd/role.cc
@@ -38,6 +38,7 @@
#include "osaf/immutil/immutil.h"
#include "base/logtrace.h"
#include "rde/agent/rda_papi.h"
+#include "osaf/consensus/service.h"
#include "amf/amfd/amfd.h"
#include "amf/amfd/imm.h"
@@ -1085,6 +1086,12 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB *cb) {
avd_d2n_msg_dequeue(cb);
}
+ Consensus consensus_service;
+ rc = consensus_service.DemoteThisNode();
+ if (rc != SA_AIS_OK) {
+ LOG_ER("Failed to demote this node from consensus service");
+ }
+
TRACE_LEAVE();
return NCSCC_RC_SUCCESS;
}
@@ -1209,13 +1216,21 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
cb->avail_state_avd = SA_AMF_HA_ACTIVE;
osaf_mutex_unlock_ordie(&imm_reinit_mutex);
+ Consensus consensus_service;
+ rc = consensus_service.BeginActivePromotion();
+ if (rc != SA_AIS_OK) {
+ LOG_ER("Unable to set active controller in consensus service");
+ LOG_ER("Split brain is possible");
+ }
+
/* Declare this standby as Active. Set Vdest role role */
if (NCSCC_RC_SUCCESS !=
(status = avd_mds_set_vdest_role(cb, SA_AMF_HA_ACTIVE))) {
LOG_ER("Switch Standby --> Active FAILED, MDS role set failed");
cb->swap_switch = false;
avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
- return NCSCC_RC_FAILURE;
+ status = NCSCC_RC_FAILURE;
+ goto done;
}
/* Time to send fail-over messages to all the AVND's */
@@ -1240,7 +1255,8 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
} else {
cb->swap_switch = false;
avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
- return NCSCC_RC_FAILURE;
+ status = NCSCC_RC_FAILURE;
+ goto done;
}
}
@@ -1259,7 +1275,8 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
in avd_imm_reinit_bg_thread.*/
} else {
avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
- return NCSCC_RC_FAILURE;
+ status = NCSCC_RC_FAILURE;
+ goto done;
}
} else
osaf_mutex_unlock_ordie(&imm_reinit_mutex);
@@ -1274,7 +1291,8 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
LOG_ER("Switch Standby --> Active, clm track start failed");
Fifo::queue(new ClmTrackStart());
avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
- return NCSCC_RC_FAILURE;
+ status = NCSCC_RC_FAILURE;
+ goto done;
}
/* Send the message to other avd for role change rsp as success */
@@ -1291,8 +1309,15 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
}
}
+ status = NCSCC_RC_SUCCESS;
+done:
+ rc = consensus_service.EndActivePromotion();
+ if (rc != SA_AIS_OK) {
+ LOG_ER("Unable to remove lock in consensus service");
+ }
+
TRACE_LEAVE();
- return NCSCC_RC_SUCCESS;
+ return status;
}
/****************************************************************************\
diff --git a/src/fm/Makefile.am b/src/fm/Makefile.am
index d48a9146c..0f254b94f 100644
--- a/src/fm/Makefile.am
+++ b/src/fm/Makefile.am
@@ -49,4 +49,5 @@ bin_osaffmd_SOURCES = \
bin_osaffmd_LDADD = \
lib/libSaAmf.la \
lib/libSaClm.la \
+ lib/libosaf_common.la \
lib/libopensaf_core.la
diff --git a/src/fm/fmd/fm_main.cc b/src/fm/fmd/fm_main.cc
index db8395ee7..1d3cba766 100644
--- a/src/fm/fmd/fm_main.cc
+++ b/src/fm/fmd/fm_main.cc
@@ -24,6 +24,7 @@ This file contains the main() routine for FM.
******************************************************************************/
#include "osaf/configmake.h"
+#include "osaf/consensus/service.h"
#include <stdlib.h>
#include <stdbool.h>
#include "base/daemon.h"
@@ -593,6 +594,12 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT
*fm_mbx_evt)
* trigerred quicker than the node_down event
* has been received.
*/
+ if (fm_cb->role == PCS_RDA_STANDBY) {
+ // update consensus service, before
fencing old active controller
+ Consensus consensus_service;
+ consensus_service.DemoteCurrentActive();
+ }
+
if (fm_cb->use_remote_fencing) {
if (fm_cb->peer_node_terminated ==
false) {
@@ -659,6 +666,11 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, FM_EVT
*fm_mbx_evt)
0, NULL,
"Failover occurred, but this node is not
yet ready");
}
+
+ // update consensus service, before fencing old active
controller
+ Consensus consensus_service;
+ consensus_service.DemoteCurrentActive();
+
/* Now. Try resetting other blade */
fm_cb->role = PCS_RDA_ACTIVE;
diff --git a/src/fm/fmd/fm_rda.cc b/src/fm/fmd/fm_rda.cc
index 5c1b33e2f..0cec70a05 100644
--- a/src/fm/fmd/fm_rda.cc
+++ b/src/fm/fmd/fm_rda.cc
@@ -19,7 +19,9 @@
#include <string.h>
#include <syslog.h>
#include "rde/agent/rda_papi.h"
+#include "osaf/consensus/service.h"
#include "base/logtrace.h"
+
extern void rda_cb(uint32_t cb_hdl, PCS_RDA_CB_INFO *cb_info,
PCSRDA_RETURN_CODE error_code);
/****************************************************************************
@@ -83,8 +85,27 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb, PCS_RDA_ROLE role)
rda_req.req_type = PCS_RDA_SET_ROLE;
rda_req.info.io_role = role;
+ osafassert(role == PCS_RDA_ACTIVE);
+
+ Consensus consensus_service;
+ rc = consensus_service.DemoteCurrentActive();
+ if (rc != SA_AIS_OK) {
+ LOG_ER("Failed to demote old active node from consensus
service");
+ }
+
+ rc = consensus_service.BeginActivePromotion();
+ if (rc != SA_AIS_OK) {
+ LOG_ER("Unable to set active controller in consensus service");
+ LOG_ER("Split brain is possible");
+ }
+
rc = pcs_rda_request(&rda_req);
if (rc != PCSRDA_RC_SUCCESS) {
+ rc = consensus_service.EndActivePromotion();
+ if (rc != SA_AIS_OK) {
+ LOG_ER("Unable to remove lock in consensus service");
+ }
+
syslog(
LOG_INFO,
"fm_rda_set_role() Failed: CurrentState: %s, AskedState:
%s",
@@ -92,6 +113,11 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb, PCS_RDA_ROLE role)
return NCSCC_RC_FAILURE;
}
+ rc = consensus_service.EndActivePromotion();
+ if (rc != SA_AIS_OK) {
+ LOG_ER("Unable to remove lock in consensus service");
+ }
+
syslog(LOG_INFO,
"fm_rda_set_role() Success: CurrentState: %s, AskedState: %s",
role_string[fm_cb->role], role_string[role]);
diff --git a/src/fm/fmd/fmd.conf b/src/fm/fmd/fmd.conf
index 6921017f7..521bfcabc 100644
--- a/src/fm/fmd/fmd.conf
+++ b/src/fm/fmd/fmd.conf
@@ -26,6 +26,14 @@ export FMS_NODE_ISOLATION_TIMEOUT=0
# To enable remote fencing change to 1
export FMS_USE_REMOTE_FENCING=0
+# To enable split-brain prevention change to 1. It only takes effect when
+# KEYVALUE_STORE_PLUGIN_CMD below is set as well (see 00-README.conf).
+export SPLIT_BRAIN_PREVENTION=0
+
+# Command used to access the key-value store of the consensus service,
+# e.g. /usr/local/lib/opensaf/etcd.plugin
+export KEYVALUE_STORE_PLUGIN_CMD=
+
# FM will supervise transitions to the ACTIVE role when this variable is set to
# a non-zero value. The value is the time in the unit of 10 ms to wait for a
# role change to ACTIVE to take effect. If AMF has not give FM an active
diff --git a/src/osaf/Makefile.am b/src/osaf/Makefile.am
index 05b78c988..10bbe427b 100644
--- a/src/osaf/Makefile.am
+++ b/src/osaf/Makefile.am
@@ -16,7 +16,9 @@
noinst_HEADERS += \
src/osaf/immutil/immutil.h \
- src/osaf/saflog/saflog.h
+ src/osaf/saflog/saflog.h \
+ src/osaf/consensus/keyvalue.h \
+ src/osaf/consensus/service.h
pkglib_LTLIBRARIES += lib/libosaf_common.la
@@ -33,7 +35,9 @@ lib_libosaf_common_la_LDFLAGS = \
lib_libosaf_common_la_SOURCES = \
src/osaf/immutil/immutil.c \
- src/osaf/saflog/saflog.c
+ src/osaf/saflog/saflog.c \
+ src/osaf/consensus/keyvalue.cc \
+ src/osaf/consensus/service.cc
nodist_EXTRA_lib_libosaf_common_la_SOURCES = dummy.cc
diff --git a/src/osaf/consensus/Makefile b/src/osaf/consensus/Makefile
new file mode 100644
index 000000000..a2c8bc9dd
--- /dev/null
+++ b/src/osaf/consensus/Makefile
@@ -0,0 +1,18 @@
+# -*- OpenSAF -*-
+#
+# (C) Copyright 2018 The OpenSAF Foundation
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed
+# under the GNU Lesser General Public License Version 2.1, February 1999.
+# The complete license can be accessed from the following location:
+# http://opensource.org/licenses/lgpl-license.php
+# See the Copying file included with the OpenSAF distribution for full
+# licensing terms.
+#
+# Author(s): Ericsson AB
+#
+
+all:
+	$(MAKE) -C ../../.. lib/libosaf_common.la
diff --git a/src/osaf/consensus/keyvalue.cc b/src/osaf/consensus/keyvalue.cc
new file mode 100644
index 000000000..e5a796d33
--- /dev/null
+++ b/src/osaf/consensus/keyvalue.cc
@@ -0,0 +1,165 @@
+/* -*- OpenSAF -*-
+ *
+ * (C) Copyright 2018 The OpenSAF Foundation
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed
+ * under the GNU Lesser General Public License Version 2.1, February 1999.
+ * The complete license can be accessed from the following location:
+ * http://opensource.org/licenses/lgpl-license.php
+ * See the Copying file included with the OpenSAF distribution for full
+ * licensing terms.
+ *
+ * Author(s): Ericsson AB
+ *
+ */
+#include "keyvalue.h"
+#include "base/logtrace.h"
+#include "base/getenv.h"
+#include "base/conf.h"
+
+// Runs <command> through popen(), collects its stdout in <output> and returns
+// the pclose() status (0 means success); returns 1 if popen() itself fails.
+int KeyValue::Execute(const std::string& command, std::string& output) {
+  TRACE_ENTER();
+  constexpr size_t buf_size = 128;
+  std::array<char, buf_size> buffer;
+  FILE* pipe = popen(command.c_str(), "r");
+  if (pipe == nullptr) return 1;
+  output.clear();
+  // Loop on fgets() itself: it returns nullptr on EOF *and* on a read error,
+  // whereas polling feof() alone spins forever once the error flag is set.
+  while (fgets(buffer.data(), buf_size, pipe) != nullptr) {
+    output += buffer.data();
+  }
+  const int exit_code = pclose(pipe);
+  // isspace() needs an unsigned char value; a negative char is undefined
+  if (!output.empty() && isspace(static_cast<unsigned char>(output.back()))) {
+    output.pop_back();  // remove newline at end of output
+  }
+  TRACE("Executed '%s', returning %d", command.c_str(), exit_code);
+  return exit_code;
+}
+
+// Fetches the value stored under <key> through the plugin's "get" command.
+// <value> receives the plugin output. Returns SA_AIS_OK when Execute()
+// returns 0, otherwise SA_AIS_ERR_FAILED_OPERATION.
+SaAisErrorT KeyValue::Get(const std::string& key, std::string& value) {
+  TRACE_ENTER();
+
+  const std::string plugin = base::GetEnv("KEYVALUE_STORE_PLUGIN_CMD", "");
+  const std::string cmdline(plugin + " get " + key);
+  const int status = KeyValue::Execute(cmdline, value);
+  TRACE("Read '%s'", value.c_str());
+
+  // any non-zero status is reported as a failed operation
+  return (status == 0) ? SA_AIS_OK : SA_AIS_ERR_FAILED_OPERATION;
+}
+
+// Stores <value> under <key> through the plugin's "set" command. Returns
+// SA_AIS_OK when Execute() returns 0, else SA_AIS_ERR_FAILED_OPERATION.
+SaAisErrorT KeyValue::Set(const std::string& key, const std::string& value) {
+  TRACE_ENTER();
+
+  const std::string plugin = base::GetEnv("KEYVALUE_STORE_PLUGIN_CMD", "");
+  const std::string cmdline(plugin + " set " + key + " " + value);
+  std::string ignored;  // plugin output is not used for "set"
+  if (KeyValue::Execute(cmdline, ignored) != 0) {
+    return SA_AIS_ERR_FAILED_OPERATION;
+  }
+
+  return SA_AIS_OK;
+}
+
+// Removes <key> through the plugin's "erase" command. Returns SA_AIS_OK
+// when Execute() returns 0, else SA_AIS_ERR_FAILED_OPERATION.
+SaAisErrorT KeyValue::Erase(const std::string& key) {
+  TRACE_ENTER();
+
+  const std::string plugin = base::GetEnv("KEYVALUE_STORE_PLUGIN_CMD", "");
+  const std::string cmdline(plugin + " erase " + key);
+  std::string ignored;  // plugin output is not used for "erase"
+  const bool erased = (KeyValue::Execute(cmdline, ignored) == 0);
+  if (erased) {
+    return SA_AIS_OK;
+  }
+  return SA_AIS_ERR_FAILED_OPERATION;
+}
+
+// Tries to take the distributed lock for <owner> through the plugin's "lock"
+// command; <timeout> (seconds) is handed to the plugin unchanged.
+// Returns SA_AIS_OK when Execute() returns 0.
+SaAisErrorT KeyValue::Lock(const std::string& owner,
+                           const unsigned int timeout) {
+  TRACE_ENTER();
+
+  const std::string plugin = base::GetEnv("KEYVALUE_STORE_PLUGIN_CMD", "");
+  const std::string cmdline(plugin + " lock " + owner + " " +
+                            std::to_string(timeout));
+  std::string ignored;  // plugin output is not used for "lock"
+  const int status = KeyValue::Execute(cmdline, ignored);
+
+  // a single attempt only; callers decide whether to retry
+  return (status == 0) ? SA_AIS_OK : SA_AIS_ERR_FAILED_OPERATION;
+}
+
+// Releases the distributed lock through the plugin's "unlock" command.
+// Returns SA_AIS_OK when Execute() returns 0, else SA_AIS_ERR_FAILED_OPERATION.
+SaAisErrorT KeyValue::Unlock() {
+  TRACE_ENTER();
+
+  const std::string plugin = base::GetEnv("KEYVALUE_STORE_PLUGIN_CMD", "");
+  const std::string cmdline(plugin + " unlock");
+  std::string ignored;  // plugin output is not used for "unlock"
+  if (Execute(cmdline, ignored) == 0) {
+    return SA_AIS_OK;
+  }
+
+  return SA_AIS_ERR_FAILED_OPERATION;
+}
+
+// Asks the plugin for the current lock owner ("lock_owner") and reports
+// whether it equals base::Conf::NodeName(). A failing plugin call counts
+// as "not locked by this node".
+bool KeyValue::IsLockedByThisNode() {
+  TRACE_ENTER();
+
+  const std::string plugin = base::GetEnv("KEYVALUE_STORE_PLUGIN_CMD", "");
+  const std::string cmdline(plugin + " lock_owner");
+  std::string owner;
+  if (KeyValue::Execute(cmdline, owner) != 0) {
+    return false;
+  }
+
+  TRACE("Lock owner is %s", owner.c_str());
+  // exact match required
+  return owner.compare(base::Conf::NodeName()) == 0;
+}
+
+// Thread body used by KeyValue::Watch(): runs the plugin's "watch" command for
+// <key> and, if it succeeds, passes its output and <user_defined> to <callback>.
+void threadFunction(const std::string& key, const ConsensusCallback& callback,
+                    const uint32_t user_defined) {
+  TRACE_ENTER();
+
+  const std::string plugin = base::GetEnv("KEYVALUE_STORE_PLUGIN_CMD", "");
+  const std::string cmdline(plugin + " watch " + key);
+  std::string new_value;
+  const int status = KeyValue::Execute(cmdline, new_value);
+  TRACE("Read '%s'", new_value.c_str());
+  if (status != 0) {
+    LOG_ER("Failed to watch %s", key.c_str());
+    return;
+  }
+  callback(new_value, user_defined);
+}
+
+// Starts a detached thread running threadFunction() for <key>; on a successful
+// watch that thread calls <callback> with the new value and <user_defined>.
+void KeyValue::Watch(const std::string& key,
+                     const ConsensusCallback callback,
+                     const uint32_t user_defined)
+{
+  std::thread(threadFunction, key, callback, user_defined).detach();
+}
diff --git a/src/osaf/consensus/keyvalue.h b/src/osaf/consensus/keyvalue.h
new file mode 100644
index 000000000..347c820d0
--- /dev/null
+++ b/src/osaf/consensus/keyvalue.h
@@ -0,0 +1,57 @@
+/* -*- OpenSAF -*-
+ *
+ * (C) Copyright 2018 The OpenSAF Foundation
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed
+ * under the GNU Lesser General Public License Version 2.1, February 1999.
+ * The complete license can be accessed from the following location:
+ * http://opensource.org/licenses/lgpl-license.php
+ * See the Copying file included with the OpenSAF distribution for full
+ * licensing terms.
+ *
+ * Author(s): Ericsson AB
+ *
+ */
+#ifndef CONSENSUS_KEYVALUE_H_
+#define CONSENSUS_KEYVALUE_H_
+
+#include <saAis.h>
+#include <string>
+#include <functional>
+#include <thread>
+
+typedef std::function<void(const std::string& new_value,
+ const uint32_t user_defined)> ConsensusCallback;
+
+class KeyValue {
+ public:
+  // Read the value stored under <key> into <value>; SA_AIS_OK on success
+  static SaAisErrorT Get(const std::string& key, std::string& value);
+
+  // Store <value> under <key>; SA_AIS_OK on success
+  static SaAisErrorT Set(const std::string& key, const std::string& value);
+
+  // Remove <key> from the store; SA_AIS_OK on success
+  static SaAisErrorT Erase(const std::string& key);
+
+  // Try once to obtain the lock for <owner>; <timeout> is in seconds (default 20)
+  static SaAisErrorT Lock(const std::string& owner,
+ const unsigned int timeout = 20);
+
+  // Release the lock; SA_AIS_OK on success
+  static SaAisErrorT Unlock();
+
+  // True if the plugin reports this node's name as the current lock owner
+  static bool IsLockedByThisNode();
+
+  // Starts a detached thread that watches <key> and calls <callback> on change
+  static void Watch(const std::string& key, ConsensusCallback callback,
+ const uint32_t user_defined);
+
+  // Internal use: run <command>, capture its stdout in <output>, return status
+  static int Execute(const std::string& command, std::string& output);
+};
+
+#endif
diff --git a/src/osaf/consensus/plugins/etcd.plugin
b/src/osaf/consensus/plugins/etcd.plugin
new file mode 100644
index 000000000..0b8c77b4e
--- /dev/null
+++ b/src/osaf/consensus/plugins/etcd.plugin
@@ -0,0 +1,217 @@
+#!/usr/bin/env bash
+
+readonly keyname="opensaf_consensus_lock"
+
+# get
+# retrieve <value> of <key> from key-value store
+# params:
+# $1 - <key>
+# returns:
+# 0 - success, <value> is echoed to stdout
+# non-zero - failure
+get() {
+  # 'local readonly key=...' would declare a second variable named "readonly"
+  local -r key="$1"
+  local value
+
+  if value=$(etcdctl get "$key" 2>&1); then
+    echo "$value"
+    return 0
+  fi
+  return 1
+}
+
+# set
+# set <key> to <value> in key-value store
+# params:
+# $1 - <key>
+# $2 - <value>
+# returns:
+# 0 - success
+# non-zero - failure
+set() {
+  # NOTE: this function shadows the shell builtin 'set' inside this script
+  local -r key="$1"
+  local -r value="$2"
+
+  # both stdout and stderr are discarded; only the exit status matters
+  if etcdctl set "$key" "$value" >/dev/null 2>&1; then
+    return 0
+  fi
+  return 1
+}
+
+# erase
+# erase <key> in key-value store
+# params:
+# $1 - <key>
+# returns:
+# 0 - success
+# non-zero - failure
+erase() {
+  local -r key="$1"
+
+  # quoted so that a key containing whitespace stays a single argument;
+  # both stdout and stderr are discarded, only the exit status matters
+  if etcdctl rm "$key" >/dev/null 2>&1; then
+    return 0
+  fi
+  return 1
+}
+
+# lock
+# params:
+# $1 - <owner>, owner of the lock is set to this
+# $2 - <timeout>, will automatically unlock after <timeout> seconds
+# returns:
+# 0 - success
+# non-zero - failure
+lock() {
+  local -r owner="$1"
+  local -r timeout="$2"
+  local current_owner
+
+  # create the lock key with <owner> as its value and a TTL of <timeout>;
+  # 'mk' is expected to fail when the key already exists (etcdctl v2)
+  if etcdctl mk "$keyname" "$owner" --ttl "$timeout" >/dev/null 2>&1; then
+    return 0
+  fi
+  # creation failed: still report success if we already hold the lock
+  current_owner=$(etcdctl get "$keyname")
+  if [ "$current_owner" = "$owner" ]; then
+    return 0
+  fi
+  return 1
+}
+
+# lock_owner
+# retrieve <owner> of lock
+# params:
+# none
+# returns:
+# 0 - success, <owner> is echoed to stdout
+# non-zero - failure or not locked
+lock_owner() {
+  # the lock is stored as the value of $keyname; get's status is returned
+  get "$keyname"
+}
+
+# unlock
+# params:
+# $1 - <forced>
+# - (optional parameter)
+# - if set 'true', will unlock even if lock is not held by node
+# - defaults to 'false'
+# returns:
+# 0 - success
+# non-zero - failure
+#
+unlock() {
+  local -r forced="${1:-false}"
+  local -r this_host="$(hostname)"
+  local owner
+
+  if [ "$forced" = false ]; then
+    # only the current owner may release the lock; NOTE(review): ownership is
+    # checked against $(hostname) while lock() stores the caller's <owner>
+    owner=$(etcdctl get "$keyname" 2>&1)
+    if [ "$owner" != "$this_host" ]; then
+      return 1
+    fi
+  fi
+
+  if etcdctl rm "$keyname" >/dev/null 2>&1; then
+    return 0
+  fi
+  return 1
+}
+
+# watch
+# watch <key> in key-value store
+# params:
+# $1 - <key>
+# returns:
+# 0 - success, <new_value> is echoed to stdout
+# non-zero - failure
+watch() {
+  local -r key="$1"
+  local value
+
+  # kept local so the result does not leak into the script's globals
+  if value=$(etcdctl watch "$key" 2>&1); then
+    # if the key is removed, then "PrevNode.Value: <value>" is returned
+    echo "$value"
+    return 0
+  fi
+  return 1
+}
+
+
+# argument parsing: "$1" selects the operation, the rest are its operands
+case "$1" in
+  get)
+    if [ "$#" -ne 2 ]; then
+      echo "Usage: $0 get <key>"
+      exit 1
+    fi
+    get "$2"
+    exit $?
+    ;;
+  set)
+    if [ "$#" -ne 3 ]; then
+      echo "Usage: $0 set <key> <value>"
+      exit 1
+    fi
+    set "$2" "$3"
+    exit $?
+    ;;
+  erase)
+    if [ "$#" -ne 2 ]; then
+      echo "Usage: $0 erase <key>"
+      exit 1
+    fi
+    erase "$2"
+    exit $?
+    ;;
+  lock)
+    if [ "$#" -ne 3 ]; then
+      echo "Usage: $0 lock <owner> <timeout>"
+      exit 1
+    fi
+    lock "$2" "$3"
+    exit $?
+    ;;
+  lock_owner)
+    if [ "$#" -ne 1 ]; then
+      echo "Usage: $0 lock_owner"
+      exit 1
+    fi
+    lock_owner
+    exit $?
+    ;;
+  unlock)
+    if [ "$#" -eq 1 ]; then
+      unlock
+      exit $?
+    elif [ "$#" -eq 2 ] && [ "$2" == "--force" ]; then
+      unlock true
+      exit $?
+    else
+      echo "Usage: $0 unlock [--force]"
+      exit 1
+    fi
+    ;;
+  watch)
+    if [ "$#" -ne 2 ]; then
+      echo "Usage: $0 watch <key>"
+      exit 1
+    fi
+    watch "$2"
+    exit $?
+    ;;
+  *)
+    echo "Usage: $0 {get|set|erase|lock|unlock|lock_owner|watch}"
+    ;;
+esac
+
+exit 1
diff --git a/src/osaf/consensus/plugins/sample.plugin
b/src/osaf/consensus/plugins/sample.plugin
new file mode 100644
index 000000000..424982448
--- /dev/null
+++ b/src/osaf/consensus/plugins/sample.plugin
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+
+readonly keyname="opensaf_consensus_lock"
+
+# get
+# retrieve <value> of <key> from key-value store
+# params:
+# $1 - <key>
+# returns:
+# 0 - success, <value> is echoed to stdout
+# non-zero - failure
+get() {
+ local readonly key=$1
+ ...
+}
+
+# set
+# set <key> to <value> in key-value store
+# params:
+# $1 - <key>
+# $2 - <value>
+# returns:
+# 0 - success
+# non-zero - failure
+set() {
+ local readonly key=$1
+ local readonly value=$2
+ ...
+}
+
+# erase
+# erase <key> in key-value store
+# params:
+# $1 - <key>
+# returns:
+# 0 - success
+# non-zero - failure
+erase() {
+ local readonly key=$1
+ ...
+}
+
+# lock
+# params:
+# $1 - <owner>, owner of the lock is set to this
+# $2 - <timeout>, will automatically unlock after <timeout> seconds
+# returns:
+# 0 - success
+# non-zero - failure
+lock() {
+ local readonly owner=$1
+ local readonly timeout=$2
+ ...
+}
+
+# lock_owner
+# retrieve <owner> of lock
+# params:
+# none
+# returns:
+# 0 - success, <owner> is echoed to stdout
+# non-zero - failure or not locked
+lock_owner() {
+ ...
+}
+
+# unlock
+# params:
+# $1 - <forced>
+# - (optional parameter)
+# - if set 'true', will unlock even if lock is not held by node
+# - defaults to 'false'
+# returns:
+# 0 - success
+# non-zero - failure
+#
+unlock() {
+ local readonly forced=${1:-false}
+ local readonly hostname=$(hostname)
+ ...
+}
+
+# watch
+# watch <key> in key-value store
+# params:
+# $1 - <key>
+# returns:
+# 0 - success, <new_value> is echoed to stdout
+# non-zero - failure
+watch() {
+ local readonly key=$1
+ ..
+}
+
+# argument parsing: "$1" selects the operation, the rest are its operands
+case "$1" in
+  get)
+    if [ "$#" -ne 2 ]; then
+      echo "Usage: $0 get <key>"
+      exit 1
+    fi
+    get "$2"
+    exit $?
+    ;;
+  set)
+    if [ "$#" -ne 3 ]; then
+      echo "Usage: $0 set <key> <value>"
+      exit 1
+    fi
+    set "$2" "$3"
+    exit $?
+    ;;
+  erase)
+    if [ "$#" -ne 2 ]; then
+      echo "Usage: $0 erase <key>"
+      exit 1
+    fi
+    erase "$2"
+    exit $?
+    ;;
+  lock)
+    if [ "$#" -ne 3 ]; then
+      echo "Usage: $0 lock <owner> <timeout>"
+      exit 1
+    fi
+    lock "$2" "$3"
+    exit $?
+    ;;
+  lock_owner)
+    if [ "$#" -ne 1 ]; then
+      echo "Usage: $0 lock_owner"
+      exit 1
+    fi
+    lock_owner
+    exit $?
+    ;;
+  unlock)
+    if [ "$#" -eq 1 ]; then
+      unlock
+      exit $?
+    elif [ "$#" -eq 2 ] && [ "$2" == "--force" ]; then
+      unlock true
+      exit $?
+    else
+      echo "Usage: $0 unlock [--force]"
+      exit 1
+    fi
+    ;;
+  watch)
+    if [ "$#" -ne 2 ]; then
+      echo "Usage: $0 watch <key>"
+      exit 1
+    fi
+    watch "$2"
+    exit $?
+    ;;
+  *)
+    echo "Usage: $0 {get|set|erase|lock|unlock|lock_owner|watch}"
+    ;;
+esac
+
+exit 1
diff --git a/src/osaf/consensus/service.cc b/src/osaf/consensus/service.cc
new file mode 100644
index 000000000..fd525b6d3
--- /dev/null
+++ b/src/osaf/consensus/service.cc
@@ -0,0 +1,231 @@
+/* -*- OpenSAF -*-
+ *
+ * (C) Copyright 2018 The OpenSAF Foundation
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed
+ * under the GNU Lesser General Public License Version 2.1, February 1999.
+ * The complete license can be accessed from the following location:
+ * http://opensource.org/licenses/lgpl-license.php
+ * See the Copying file included with the OpenSAF distribution for full
+ * licensing terms.
+ *
+ * Author(s): Ericsson AB
+ *
+ */
+#include "service.h"
+#include "base/logtrace.h"
+#include "base/conf.h"
+#include "base/getenv.h"
+#include "base/ncssysf_def.h"
+#include <unistd.h>
+#include <climits>
+
+// Takes the distributed lock (retrying until it succeeds), fences the node
+// currently recorded as active if that is another node, then records this
+// node as the active controller and returns the result of that Set().
+// The lock is not released here; see EndActivePromotion().
+SaAisErrorT Consensus::BeginActivePromotion() {
+  TRACE_ENTER();
+
+  if (!use_consensus_) {
+    return SA_AIS_OK;
+  }
+
+  // each attempt asks for a 30 second lock timeout
+  while (KeyValue::Lock(base::Conf::NodeName(), 30) != SA_AIS_OK) {
+    TRACE("Waiting for lock");
+    usleep(sleep_internal);
+  }
+  LOG_IN("Node %s obtained lock", base::Conf::NodeName().c_str());
+
+  // check current active node
+  std::string recorded_active;
+  if (KeyValue::Get(keyname, recorded_active) == SA_AIS_OK) {
+    LOG_NO("Current active controller is %s", recorded_active.c_str());
+    if (recorded_active != base::Conf::NodeName()) {
+      FenceNode(recorded_active);
+    }
+  }
+
+  LOG_NO("Setting active controller to %s", base::Conf::NodeName().c_str());
+  return KeyValue::Set(keyname, base::Conf::NodeName());
+}
+
+// Releases the lock taken by BeginActivePromotion(), retrying until the
+// unlock succeeds. Always returns SA_AIS_OK, even if the lock was lost.
+SaAisErrorT Consensus::EndActivePromotion() {
+  TRACE_ENTER();
+  if (!use_consensus_) {
+    return SA_AIS_OK;
+  }
+
+  // nothing to release if another node (or nobody) owns the lock by now
+  if (!KeyValue::IsLockedByThisNode()) {
+    LOG_ER("Lock unexpectedly released");
+    return SA_AIS_OK;
+  }
+
+  while (KeyValue::Unlock() != SA_AIS_OK) {
+    LOG_IN("Trying to unlock");
+    usleep(sleep_internal);
+  }
+  LOG_IN("Released lock");
+  return SA_AIS_OK;
+}
+
+// Clears the active-controller key in the key-value store while holding the
+// distributed lock. A non-empty <node> that differs from the recorded active
+// node is fenced first. Returns SA_AIS_OK if consensus is disabled, the key
+// could not be read or it was erased; otherwise the KeyValue::Erase() error,
+// which was previously logged but then dropped in favour of SA_AIS_OK.
+SaAisErrorT Consensus::Demote(const std::string node = "")
+{
+  TRACE_ENTER();
+  if (use_consensus_ == false) {
+    return SA_AIS_OK;
+  }
+
+  while (KeyValue::Lock(base::Conf::NodeName(), 30) != SA_AIS_OK) {
+    LOG_IN("Waiting for lock");
+    usleep(sleep_internal);
+  }
+
+  SaAisErrorT result = SA_AIS_OK;
+  std::string current;
+  if (KeyValue::Get(keyname, current) == SA_AIS_OK) {
+    LOG_NO("Demoting %s as active controller", current.c_str());
+    if (node.empty() == false && node != current) {
+      FenceNode(node);
+    }
+    // if Demote() was called as DemoteCurrentActive() from fmd,
+    // then fmd will fence the node itself
+    result = KeyValue::Erase(keyname);
+    if (result != SA_AIS_OK) {
+      LOG_ER("Failed to clear active controller in KeyValue");
+    } else {
+      LOG_NO("Node %s demoted", current.c_str());
+    }
+  }
+
+  // the lock is released regardless of the outcome above
+  while (KeyValue::Unlock() != SA_AIS_OK) {
+    LOG_IN("Trying to unlock");
+    usleep(sleep_internal);
+  }
+  LOG_IN("Released lock");
+
+  return result;
+}
+
+SaAisErrorT Consensus::DemoteCurrentActive() {
+  TRACE_ENTER();
+  return Demote();  // default (empty) node: Demote() skips its own fencing
+}
+
+SaAisErrorT Consensus::DemoteThisNode() {
+  TRACE_ENTER();
+  return Demote(base::Conf::NodeName());  // demote using this node's own name
+}
+
+bool Consensus::IsEnabled() const {
+  return use_consensus_;  // assigned in the constructor from the environment
+}
+
+// Returns the node name stored under the active-controller key, or an
+// empty string if the consensus service is disabled or the key cannot be
+// read. The key is read while holding the distributed lock; taking and
+// releasing the lock are both retried, sleeping sleep_internal microseconds
+// between attempts, until they succeed.
+std::string Consensus::CurrentActive() const {
+  TRACE_ENTER();
+  if (!use_consensus_) {
+    return std::string();
+  }
+
+  // wait for the lock
+  while (KeyValue::Lock(base::Conf::NodeName(), 30) != SA_AIS_OK) {
+    LOG_IN("Waiting for lock");
+    usleep(sleep_internal);
+  }
+
+  // check current active node
+  std::string active;
+  const bool found = (KeyValue::Get(keyname, active) == SA_AIS_OK);
+  if (found) {
+    LOG_IN("Current active controller is %s", active.c_str());
+  }
+
+  // release the lock before reporting the result
+  while (KeyValue::Unlock() != SA_AIS_OK) {
+    LOG_IN("Trying to unlock");
+    usleep(sleep_internal);
+  }
+
+  LOG_IN("Released lock");
+
+  if (!found) {
+    // a failed Get() may still have written plugin output to <active>
+    return std::string();
+  }
+  return active;
+}
+
+// Reads SPLIT_BRAIN_PREVENTION, KEYVALUE_STORE_PLUGIN_CMD and
+// FMS_USE_REMOTE_FENCING from the environment, initialises the node name
+// and logs whether split brain prevention is active.
+Consensus::Consensus() {
+  TRACE_ENTER();
+
+  const uint32_t enabled = base::GetEnv("SPLIT_BRAIN_PREVENTION", 0);
+  const std::string plugin = base::GetEnv("KEYVALUE_STORE_PLUGIN_CMD", "");
+  const uint32_t fencing = base::GetEnv("FMS_USE_REMOTE_FENCING", 0);
+
+  // the feature needs both the switch and a plugin command
+  use_consensus_ = (enabled == 1) && !plugin.empty();
+
+  if (fencing == 1) {
+    use_remote_fencing_ = true;
+  }
+
+  // needed for base::Conf::NodeName() later
+  base::Conf::InitNodeName();
+
+  if (!use_consensus_) {
+    LOG_NO("Split brain prevention is disabled");
+  } else {
+    LOG_NO("Split brain prevention is enabled");
+  }
+}
+
+Consensus::~Consensus()
+{  // intentionally empty
+}
+
+// Calls opensaf_reboot() for <node> when remote fencing is enabled.
+// Returns true if fencing was requested, false if fencing is disabled.
+bool Consensus::FenceNode(const std::string& node)
+{
+  if (!use_remote_fencing_) {
+    LOG_WA("Fencing is not enabled. Node %s will not be fenced", node.c_str());
+    return false;
+  }
+
+  LOG_WA("Fencing remote node %s", node.c_str());
+  // @todo currently passing UINT_MAX as node ID, since
+  // we can't always obtain a valid node ID?
+  opensaf_reboot(UINT_MAX, node.c_str(), "Fencing remote node");
+  return true;
+}
+
+// Watches the active-controller key through KeyValue::Watch(), passing on
+// <callback> and <user_defined>. Does nothing when consensus is disabled.
+void Consensus::MonitorActive(ConsensusCallback callback,
+                              const uint32_t user_defined)
+{
+  TRACE_ENTER();
+  if (use_consensus_) {
+    KeyValue::Watch(keyname, callback, user_defined);
+  }
+}
diff --git a/src/osaf/consensus/service.h b/src/osaf/consensus/service.h
new file mode 100644
index 000000000..03f7f26f9
--- /dev/null
+++ b/src/osaf/consensus/service.h
@@ -0,0 +1,66 @@
+/* -*- OpenSAF -*-
+ *
+ * (C) Copyright 2018 The OpenSAF Foundation
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. This file and program are licensed
+ * under the GNU Lesser General Public License Version 2.1, February 1999.
+ * The complete license can be accessed from the following location:
+ * http://opensource.org/licenses/lgpl-license.php
+ * See the Copying file included with the OpenSAF distribution for full
+ * licensing terms.
+ *
+ * Author(s): Ericsson AB
+ *
+ */
+#ifndef CONSENSUS_SERVICE_H_
+#define CONSENSUS_SERVICE_H_
+
+#include "keyvalue.h"
+#include "saAis.h"
+#include <string>
+
+// Front end to the consensus service used for split-brain prevention. The
+// currently active controller is stored under the key <keyname>.
+class Consensus {
+ public:
+  explicit Consensus();
+  virtual ~Consensus();
+
+  // Instances are not copyable
+  Consensus(const Consensus&) = delete;
+  Consensus& operator=(const Consensus&) = delete;
+
+  // Is consensus service enabled?
+  bool IsEnabled() const;
+
+  // Returns active controller as known by the consensus service
+  std::string CurrentActive() const;
+
+  // Obtain lock, set active controller to this node
+  SaAisErrorT BeginActivePromotion();
+
+  // Release lock
+  SaAisErrorT EndActivePromotion();
+
+  // Obtain lock, clear current active controller, release lock
+  SaAisErrorT DemoteCurrentActive();
+
+  // Obtain lock, clear this node as active controller, release lock
+  SaAisErrorT DemoteThisNode();
+
+  // If the active controller is changed as known by the consensus service,
+  // then callback will be run from a new thread, with <user_defined> returned
+  // in the callback
+  void MonitorActive(ConsensusCallback callback, const uint32_t user_defined);
+
+ private:
+  SaAisErrorT Demote(const std::string node);
+  bool FenceNode(const std::string& node);
+
+  // Set by the constructor from the environment
+  bool use_consensus_ = false;
+  bool use_remote_fencing_ = false;
+
+  // Key under which the active controller is stored
+  const std::string keyname = "opensaf_active_controller";
+
+  // Delay between retries, in microseconds (passed to usleep())
+  static constexpr int sleep_internal = 100000;
+};
+
+#endif
diff --git a/src/rde/Makefile.am b/src/rde/Makefile.am
index c967f9fc4..182f347ab 100644
--- a/src/rde/Makefile.am
+++ b/src/rde/Makefile.am
@@ -58,7 +58,8 @@ bin_osafrded_SOURCES = \
bin_osafrded_LDADD = \
lib/libSaAmf.la \
- lib/libopensaf_core.la
+ lib/libopensaf_core.la \
+ lib/libosaf_common.la
bin_rdegetrole_CPPFLAGS = \
$(AM_CPPFLAGS)
diff --git a/src/rde/rded/osaf-rded.in b/src/rde/rded/osaf-rded.in
index 1c1786c8d..1697936a7 100644
--- a/src/rde/rded/osaf-rded.in
+++ b/src/rde/rded/osaf-rded.in
@@ -28,6 +28,10 @@ else
. $pkgsysconfdir/rde.conf
fi
+# Also source fmd.conf when it exists. NOTE(review): presumably this supplies
+# the split-brain/fencing settings that Consensus reads from the environment
+# (e.g. FMS_USE_REMOTE_FENCING) -- confirm against fmd.conf.
+if [ -f "$pkgsysconfdir/fmd.conf" ]; then
+ . "$pkgsysconfdir/fmd.conf"
+fi
+
binary=$pkglibdir/$osafprog
pidfile=$pkgpiddir/$osafprog.pid
tracefile=$pkglogdir/$osafprog.log
diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
index d2a3d46b2..83f35c691 100644
--- a/src/rde/rded/rde_cb.h
+++ b/src/rde/rded/rde_cb.h
@@ -45,7 +45,8 @@ enum RDE_MSG_TYPE {
RDE_MSG_PEER_UP = 1,
RDE_MSG_PEER_DOWN = 2,
RDE_MSG_PEER_INFO_REQ = 3,
- RDE_MSG_PEER_INFO_RESP = 4
+ RDE_MSG_PEER_INFO_RESP = 4,
+ RDE_MSG_NEW_ACTIVE_CALLBACK = 5
};
struct rde_peer_info {
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index 0298bf3ff..23c03f552 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -28,6 +28,7 @@
#include <cerrno>
#include <cstdlib>
#include <cstring>
+#include "osaf/consensus/service.h"
#include "base/daemon.h"
#include "base/logtrace.h"
#include "base/osaf_poll.h"
@@ -37,6 +38,7 @@
#include <saAmf.h>
#include "rde/rded/rde_cb.h"
#include "rde/rded/role.h"
+#include "base/conf.h"
#define RDA_MAX_CLIENTS 32
@@ -92,10 +94,6 @@ static void handle_mbx_event() {
TRACE_ENTER();
msg = reinterpret_cast<rde_msg *>(ncs_ipc_non_blk_recv(&rde_cb->mbx));
- TRACE("Received %s from node 0x%x with state %s. My state is %s",
- rde_msg_name[msg->type], msg->fr_node_id,
- Role::to_string(msg->info.peer_info.ha_role),
- Role::to_string(role->role()));
switch (msg->type) {
case RDE_MSG_PEER_INFO_REQ:
@@ -118,6 +116,29 @@ static void handle_mbx_event() {
case RDE_MSG_PEER_DOWN:
LOG_NO("Peer down on node 0x%x", msg->fr_node_id);
break;
+ case RDE_MSG_NEW_ACTIVE_CALLBACK:
+ {
+ const std::string my_node = base::Conf::NodeName();
+
+ // get current active controller
+ Consensus consensus_service;
+ std::string active_controller = consensus_service.CurrentActive();
+
+ LOG_NO("New active controller notification from consensus service");
+
+ if (role->role() == PCS_RDA_ACTIVE) {
+ if (my_node.compare(active_controller) != 0) {
+ // we are meant to be active, but consensus service doesn't think so
+ LOG_ER("Role does not match consensus service");
+ LOG_ER("Probable split brain. Rebooting this node");
+ opensaf_reboot(0, nullptr, "Split-brain detected by consensus
service");
+ } else {
+ // get more callbacks
+ consensus_service.MonitorActive(Role::MonitorCallback,
rde_cb->mbx);
+ }
+ }
+ }
+ break;
default:
LOG_ER("%s: discarding unknown message type %u", __FUNCTION__,
msg->type);
break;
@@ -205,11 +226,12 @@ int main(int argc, char *argv[]) {
NCS_SEL_OBJ mbx_sel_obj;
RDE_RDA_CB *rde_rda_cb = &rde_cb->rde_rda_cb;
int term_fd;
-
opensaf_reboot_prepare();
daemonize(argc, argv);
+ base::Conf::InitNodeName();
+
if (initialize_rde() != NCSCC_RC_SUCCESS) goto init_failed;
mbx_sel_obj = ncs_ipc_get_sel_obj(&rde_cb->mbx);
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index f7511f0d8..28e034c44 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -21,6 +21,7 @@
#include <cinttypes>
#include "rde/rded/role.h"
+#include "rde/rded/rde_cb.h"
#include <cstdint>
#include "base/logtrace.h"
#include "base/getenv.h"
@@ -28,6 +29,7 @@
#include "base/time.h"
#include "base/ncs_main_papi.h"
#include "rde/rded/rde_cb.h"
+#include "osaf/consensus/service.h"
const char* const Role::role_names_[] = {"Undefined", "ACTIVE", "STANDBY",
"QUIESCED", "QUIESCING", "Invalid"};
@@ -42,6 +44,23 @@ const char* Role::to_string(PCS_RDA_ROLE role) {
: role_names_[0];
}
+// Callback registered with Consensus::MonitorActive(). Posts an
+// RDE_MSG_NEW_ACTIVE_CALLBACK message to the mailbox <mbx>.
+//
+// new_value: value reported by the consensus service; not used here.
+// mbx:       mailbox that receives the notification message.
+//
+// Asserts if the message cannot be allocated or sent.
+void Role::MonitorCallback(const std::string& new_value, SYSF_MBX mbx) {
+  TRACE_ENTER();
+
+  // calloc rather than malloc: every rde_msg field other than <type> is
+  // zeroed instead of being posted with indeterminate contents
+  rde_msg* msg = static_cast<rde_msg*>(calloc(1, sizeof(rde_msg)));
+  // fail loudly instead of dereferencing a null pointer when out of memory
+  osafassert(msg != nullptr);
+  msg->type = RDE_MSG_NEW_ACTIVE_CALLBACK;
+
+  // wait a few seconds before sending msg to allow role change callbacks
+  // (from AMF) to be processed first
+  sleep(3);
+
+  const uint32_t status = m_NCS_IPC_SEND(&mbx, msg, NCS_IPC_PRIORITY_NORMAL);
+  osafassert(status == NCSCC_RC_SUCCESS);
+}
+
Role::Role(NODE_ID own_node_id)
: known_nodes_{},
role_{PCS_RDA_QUIESCED},
@@ -61,10 +80,27 @@ timespec* Role::Poll(timespec* ts) {
*ts = election_end_time_ - now;
timeout = ts;
} else {
+ SaAisErrorT rc;
+ Consensus consensus_service;
+ rc = consensus_service.BeginActivePromotion();
+ if (rc != SA_AIS_OK) {
+ LOG_ER("Unable to set active controller in consensus service");
+ LOG_ER("Split brain is possible");
+ }
+
ExecutePreActiveScript();
LOG_NO("Switched to ACTIVE from %s", to_string(role()));
role_ = PCS_RDA_ACTIVE;
rde_rda_send_role(role_);
+
+ rc = consensus_service.EndActivePromotion();
+ if (rc != SA_AIS_OK) {
+ LOG_ER("Unable to remove lock in consensus service");
+ }
+
+ // get callback if active controller is changed in consensus service
+ RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+ consensus_service.MonitorActive(MonitorCallback, cb->mbx);
}
}
return timeout;
@@ -91,7 +127,14 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
}
if (new_role != old_role) {
LOG_NO("RDE role set to %s", to_string(new_role));
- if (new_role == PCS_RDA_ACTIVE) ExecutePreActiveScript();
+ if (new_role == PCS_RDA_ACTIVE) {
+ ExecutePreActiveScript();
+
+ // get callback if active controller is changed in consensus service
+ Consensus consensus_service;
+ RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+ consensus_service.MonitorActive(MonitorCallback, cb->mbx);
+ }
role_ = new_role;
if (new_role == PCS_RDA_UNDEFINED) {
known_nodes_.clear();
diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
index 20219b535..6d3c77d2f 100644
--- a/src/rde/rded/role.h
+++ b/src/rde/rded/role.h
@@ -24,6 +24,7 @@
#include "base/macros.h"
#include "mds/mds_papi.h"
#include "rde/agent/rda_papi.h"
+#include <string>
namespace base {
class Process;
@@ -38,6 +39,7 @@ class Role {
uint32_t SetRole(PCS_RDA_ROLE new_role);
PCS_RDA_ROLE role() const;
static const char* to_string(PCS_RDA_ROLE role);
+ static void MonitorCallback(const std::string& new_value, SYSF_MBX mbx);
private:
static const uint64_t kDefaultDiscoverPeerTimeout = 2000;
--
2.14.1
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel