osaf/services/infrastructure/fm/config/fmd.conf | 6 ++
osaf/services/infrastructure/fm/fms/fm.h | 4 +-
osaf/services/infrastructure/fm/fms/fm_amf.c | 10 ++++-
osaf/services/infrastructure/fm/fms/fm_cb.h | 3 +
osaf/services/infrastructure/fm/fms/fm_main.c | 51 +++++++++++++++++++++++-
5 files changed, 68 insertions(+), 6 deletions(-)
Add supervision of ACTIVE assignment from AMF after a role change to the ACTIVE
role. If no AMF assignment has been received within the time limit, the node
will be rebooted. This will enhance availability by detecting and mitigating
problems during the transition to the ACTIVE role.
diff --git a/osaf/services/infrastructure/fm/config/fmd.conf
b/osaf/services/infrastructure/fm/config/fmd.conf
--- a/osaf/services/infrastructure/fm/config/fmd.conf
+++ b/osaf/services/infrastructure/fm/config/fmd.conf
@@ -19,6 +19,12 @@ export FMS_HA_ENV_HEALTHCHECK_KEY="Defau
# Promote active timer
export FMS_PROMOTE_ACTIVE_TIMER=0
+# FM supervises transitions to the ACTIVE role unless this variable is set to
+# zero. The value is the time, in units of 10 ms, to wait for a role change to
+# ACTIVE to take effect (default 1000, i.e. 10 seconds, when unset). If AMF has
+# not given FM an active assignment within this time, the node is rebooted.
+#export FMS_ACTIVATION_SUPERVISION_TIMER=1000
+
# Uncomment the next line to enable info level logging
#args="--loglevel=info"
diff --git a/osaf/services/infrastructure/fm/fms/fm.h
b/osaf/services/infrastructure/fm/fms/fm.h
--- a/osaf/services/infrastructure/fm/fms/fm.h
+++ b/osaf/services/infrastructure/fm/fms/fm.h
@@ -75,6 +75,6 @@
extern void amfnd_down_callback(void);
extern void ava_install_amf_down_cb(void (*cb) (void));
-extern uint32_t initialize_for_assignment(FM_CB *cb,
- SaAmfHAStateT
ha_state);
+extern uint32_t initialize_for_assignment(FM_CB *cb, SaAmfHAStateT ha_state);
+extern void fm_tmr_stop(FM_TMR *tmr);
#endif
diff --git a/osaf/services/infrastructure/fm/fms/fm_amf.c
b/osaf/services/infrastructure/fm/fms/fm_amf.c
--- a/osaf/services/infrastructure/fm/fms/fm_amf.c
+++ b/osaf/services/infrastructure/fm/fms/fm_amf.c
@@ -128,7 +128,8 @@ void fm_amf_give_hdl(void)
* Notes : None.
*****************************************************************************/
void fm_saf_CSI_set_callback(SaInvocationT invocation,
- const SaNameT *compName, SaAmfHAStateT
new_haState, SaAmfCSIDescriptorT csiDescriptor)
+ const SaNameT *compName, SaAmfHAStateT new_haState,
+ SaAmfCSIDescriptorT csiDescriptor)
{
FM_AMF_CB *fm_amf_cb;
SaAisErrorT error = SA_AIS_OK;
@@ -138,6 +139,13 @@ void fm_saf_CSI_set_callback(SaInvocatio
compName->value, ha_role_string[new_haState - 1]);
fm_amf_cb = fm_amf_take_hdl();
if (fm_amf_cb != NULL) {
+ if (new_haState == SA_AMF_HA_ACTIVE &&
+ fm_cb->activation_supervision_tmr.status ==
+ FM_TMR_RUNNING) {
+ fm_tmr_stop(&fm_cb->activation_supervision_tmr);
+ LOG_NO("Stopped activation supervision due to new AMF "
+ "state %u", (unsigned) new_haState);
+ }
if ((rc = initialize_for_assignment(fm_cb, new_haState)) !=
NCSCC_RC_SUCCESS) {
LOG_ER("initialize_for_assignment FAILED %u",
diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h
b/osaf/services/infrastructure/fm/fms/fm_cb.h
--- a/osaf/services/infrastructure/fm/fms/fm_cb.h
+++ b/osaf/services/infrastructure/fm/fms/fm_cb.h
@@ -32,6 +32,7 @@ uint32_t gl_fm_hdl;
typedef enum {
FM_TMR_TYPE_MIN,
FM_TMR_PROMOTE_ACTIVE,
+ FM_TMR_ACTIVATION_SUPERVISION,
FM_TMR_TYPE_MAX
} FM_TMR_TYPE;
@@ -76,9 +77,11 @@ typedef struct fm_cb {
/* Timers */
FM_TMR promote_active_tmr;
+ FM_TMR activation_supervision_tmr;
/* Time in terms of one hundredth of seconds (500 for 5 secs.) */
uint32_t active_promote_tmr_val;
+ uint32_t activation_supervision_tmr_val;
bool fully_initialized;
bool csi_assigned;
/* Variable to indicate OpenSAF control of TIPC transport */
diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c
b/osaf/services/infrastructure/fm/fms/fm_main.c
--- a/osaf/services/infrastructure/fm/fms/fm_main.c
+++ b/osaf/services/infrastructure/fm/fms/fm_main.c
@@ -389,12 +389,22 @@ static uint32_t fm_get_args(FM_CB *fm_cb
/* Update fm_cb configuration fields */
fm_cb->node_id = m_NCS_GET_NODE_ID;
- fm_cb->active_promote_tmr_val =
atoi(getenv("FMS_PROMOTE_ACTIVE_TIMER"));
+ fm_cb->active_promote_tmr_val =
+ atoi(getenv("FMS_PROMOTE_ACTIVE_TIMER"));
+ char* activation_supervision_tmr_val =
+ getenv("FMS_ACTIVATION_SUPERVISION_TIMER");
+ if (activation_supervision_tmr_val != NULL) {
+ fm_cb->activation_supervision_tmr_val =
+ atoi(activation_supervision_tmr_val);
+ } else {
+ fm_cb->activation_supervision_tmr_val = 1000;
+ }
/* Set timer variables */
fm_cb->promote_active_tmr.type = FM_TMR_PROMOTE_ACTIVE;
-
- TRACE_LEAVE();
+ fm_cb->activation_supervision_tmr.type = FM_TMR_ACTIVATION_SUPERVISION;
+
+ TRACE_LEAVE();
return NCSCC_RC_SUCCESS;
}
@@ -521,6 +531,10 @@ static void fm_mbx_msg_handler(FM_CB *fm
opensaf_reboot(fm_cb->peer_node_id, (char
*)fm_cb->peer_node_name.value,
"Received Node Down for Active peer");
fm_rda_set_role(fm_cb, PCS_RDA_ACTIVE);
+ } else if (fm_mbx_evt->info.fm_tmr->type ==
FM_TMR_ACTIVATION_SUPERVISION) {
+ opensaf_reboot(0, NULL, "Activation timer supervision "
+ "expired: no ACTIVE assignment received "
+ "within the time limit");
}
break;
case FM_EVT_RDA_ROLE:
@@ -544,6 +558,22 @@ static void fm_evt_proc_rda_callback(FM_
uint32_t rc = NCSCC_RC_SUCCESS;
TRACE_ENTER2("%d", (int) evt->info.rda_info.role);
+	/*
+	 * The new role is not ACTIVE: cancel a pending activation supervision.
+	 * The check must look at the supervision timer itself (not the
+	 * promote-active timer), otherwise a running supervision would be left
+	 * armed and its expiry would reboot the node.
+	 */
+	if (evt->info.rda_info.role != PCS_RDA_ACTIVE &&
+	    cb->activation_supervision_tmr.status == FM_TMR_RUNNING) {
+		fm_tmr_stop(&cb->activation_supervision_tmr);
+		LOG_NO("Stopped activation supervision due to new role %u",
+		       (unsigned) evt->info.rda_info.role);
+	}
+	/*
+	 * Transition to ACTIVE while AMF has not yet assigned ACTIVE: start
+	 * the supervision timer, unless supervision is disabled (value 0) or
+	 * the supervision timer is already running.
+	 */
+	if (evt->info.rda_info.role == PCS_RDA_ACTIVE &&
+	    cb->role != PCS_RDA_ACTIVE &&
+	    cb->amf_state != SA_AMF_HA_ACTIVE &&
+	    cb->activation_supervision_tmr_val != 0 &&
+	    cb->activation_supervision_tmr.status != FM_TMR_RUNNING) {
+		LOG_NO("Starting activation supervision: %" PRIu64 "ms",
+		       10 * (uint64_t) cb->activation_supervision_tmr_val);
+		fm_tmr_start(&cb->activation_supervision_tmr,
+			     cb->activation_supervision_tmr_val);
+	}
if ((rc = initialize_for_assignment(cb,
(SaAmfHAStateT) evt->info.rda_info.role)) != NCSCC_RC_SUCCESS) {
LOG_ER("initialize_for_assignment FAILED %u", (unsigned) rc);
@@ -590,6 +620,21 @@ uint32_t fm_tmr_start(FM_TMR *tmr, SaTim
return NCSCC_RC_SUCCESS;
}
+/*
+ * fm_tmr_stop: release the timer held by 'tmr' and mark it stopped.
+ *
+ * If a timer id is present it is stopped first when the status says it is
+ * running, then destroyed and the id cleared. The status is set to
+ * FM_TMR_STOPPED in every case, including when no timer id was present.
+ */
+void fm_tmr_stop(FM_TMR *tmr)
+{
+	TRACE_ENTER();
+	/* Decide once whether there is an underlying timer to release. */
+	const bool has_timer = (tmr->tmr_id != NULL);
+
+	if (has_timer && tmr->status == FM_TMR_RUNNING) {
+		m_NCS_TMR_STOP(tmr->tmr_id);
+	}
+	if (has_timer) {
+		m_NCS_TMR_DESTROY(tmr->tmr_id);
+		tmr->tmr_id = NULL;
+	}
+	tmr->status = FM_TMR_STOPPED;
+	TRACE_LEAVE();
+}
+
/****************************************************************************
* Name : fm_tmr_exp
*
------------------------------------------------------------------------------
Site24x7 APM Insight: Get Deep Visibility into Application Performance
APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
Monitor end-to-end web transactions and take corrective actions now
Troubleshoot faster and improve end-user experience. Signup Now!
http://pubads.g.doubleclick.net/gampad/clk?id=272487151&iu=/4140
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel