osaf/services/saf/immsv/immd/immd_cb.h | 10 ++-
osaf/services/saf/immsv/immd/immd_evt.c | 1 +
osaf/services/saf/immsv/immd/immd_proc.c | 104 +++++++++++++++++++++++++++++-
osaf/services/saf/immsv/immd/immd_proc.h | 2 +
osaf/services/saf/immsv/immd/immd_sbevt.c | 24 ++++++
5 files changed, 138 insertions(+), 3 deletions(-)
Standby IMMD records IMMND down events for payloads, including the
epoch in which each occurred. The recorded payload down events are
discarded by standby IMMD when a new IMMND with the same node-id is
introduced. At failover, the new active IMMD will generate a discard-node
event for any recorded payload down events that are still in the
same epoch. Normally these discard-node events will be redundant.
In rare cases they plug the hole that this ticket reported.
diff --git a/osaf/services/saf/immsv/immd/immd_cb.h
b/osaf/services/saf/immsv/immd/immd_cb.h
--- a/osaf/services/saf/immsv/immd/immd_cb.h
+++ b/osaf/services/saf/immsv/immd/immd_cb.h
@@ -53,13 +53,20 @@ typedef struct immd_immnd_info_node {
/*ABT below corresponds to old ImmEvs::NodeInfo */
int immnd_execPid;
- int epoch;
+ unsigned int epoch;
bool syncRequested;
bool isOnController;
bool isCoord;
bool syncStarted;
} IMMD_IMMND_INFO_NODE;
+typedef struct immd_immnd_detached_node { /* IMMD SBY record of one departed payload IMMND */
+ NODE_ID node_id; /* copied from immnd_key of the lost IMMND */
+ int immnd_execPid; /* copied from immnd_execPid of the lost IMMND */
+ unsigned int epoch; /* epoch held by the lost IMMND when the down event was recorded */
+ struct immd_immnd_detached_node *next; /* next record; list head is detached_nodes in IMMD_CB */
+} IMMD_IMMND_DETACHED_NODE;
+
typedef struct immd_cb_tag {
SYSF_MBX mbx;
SaNameT comp_name;
@@ -117,6 +124,7 @@ typedef struct immd_cb_tag {
IMMD_SAVED_FEVS_MSG *saved_msgs;
SaImmRepositoryInitModeT mRim; /* Should be the rim obtained from
coord. */
+ IMMD_IMMND_DETACHED_NODE *detached_nodes; /* IMMD SBY list of recently
departed payloads */
} IMMD_CB;
uint32_t immd_immnd_info_tree_init(IMMD_CB *cb);
diff --git a/osaf/services/saf/immsv/immd/immd_evt.c
b/osaf/services/saf/immsv/immd/immd_evt.c
--- a/osaf/services/saf/immsv/immd/immd_evt.c
+++ b/osaf/services/saf/immsv/immd/immd_evt.c
@@ -1978,6 +1978,7 @@ static uint32_t immd_evt_proc_lga_callba
/* Change of role to active => We may need to elect new coord */
immd_proc_elect_coord(cb, true);
immd_db_purge_fevs(cb);
+ immd_pending_payload_discards(cb); /*Ensure node down for
payloads.*/
}
done:
TRACE_LEAVE();
diff --git a/osaf/services/saf/immsv/immd/immd_proc.c
b/osaf/services/saf/immsv/immd/immd_proc.c
--- a/osaf/services/saf/immsv/immd/immd_proc.c
+++ b/osaf/services/saf/immsv/immd/immd_proc.c
@@ -439,17 +439,117 @@ uint32_t immd_process_immnd_down(IMMD_CB
free(tmpData);
}
+ } else if(!(immnd_info->isOnController)) {
+ /* Standby NOT immediately sending redundant D2ND_DISCARD_NODE
in this case.
+ But will record any payload down event in case the active SC
is included
+ in a burst of node downs. See ticket #563. The active IMMD
may be going
+ down together with many payload nodes, such that the active
IMMD never has
+ time to generate the discard node message for all payloads.
This will be
+ detected if this (standby) IMMD becomes active in close time
proximity.
+ See immd_pending_payload_discards() below.
+ */
+
+ LOG_IN("Standby IMMD recording IMMND DOWN for node %x",
immnd_info->immnd_key);
+ IMMD_IMMND_DETACHED_NODE *detached_node = calloc(1,
sizeof(IMMD_IMMND_DETACHED_NODE));
+ osafassert(detached_node);
+ detached_node->node_id = immnd_info->immnd_key;
+ detached_node->immnd_execPid = immnd_info->immnd_execPid;
+ detached_node->epoch = immnd_info->epoch;
+ detached_node->next = cb->detached_nodes;
+ cb->detached_nodes = detached_node;
}
/*We remove the node for the lost IMMND on both active and standby. */
- TRACE_5("Removing node key:%u dest:%u", immnd_info->immnd_key,
- m_NCS_NODE_ID_FROM_MDS_DEST(immnd_info->immnd_dest));
+ TRACE_5("Removing node id:%x", immnd_info->immnd_key);
immd_immnd_info_node_delete(cb, immnd_info);
immd_cb_dump();
TRACE_LEAVE();
return NCSCC_RC_SUCCESS;
}
+
+/****************************************************************************
+ * Name : immd_pending_payload_discards
+ *
+ * Description : Send possibly redundant discard-node message to IMMNDs for
+ * payload nodes (IMMNDs) that have departed and not returned.
+ * This is needed to plug the small hole that exists in the
+ * handling of IMMND node down, when the current active IMMD
+ * is being taken down concurrently with several payloads.
+ *
+ * The active IMMD may then be pulled down after having
+ * received the IMMND MDS down event for the payloads, but
+ * before having created or sent the fevs message broadcasting
+ * each node down to the IMMND cluster.
+ *
+ * The list of detached nodes is screened for having occurred
+ * in the current epoch. This is an extra guard against the
+ * new active shooting down a recently restarted payload. The
+ * list is also pruned in immd_sbevt.c when it receives info
+ * about a payload having re-joined.
+ *
+ * This function should only be invoked by the just recently
+ * newly active IMMD. It is only relevant for fail-over, not
+ * for switch-over (si-swap) since for a switch-over the old
+ * active would never drop sending node-down messages.
+ *
+ * Return Values : -
+ *
+ * Notes : Every record on cb->detached_nodes is unlinked and freed by
+ * this call, whether or not a discard was sent for it.
+ *****************************************************************************/
+void immd_pending_payload_discards(IMMD_CB *cb)
+{
+ IMMSV_EVT send_evt;
+ char *tmpData = NULL;
+ NCS_UBAID uba;
+ TRACE_ENTER();
+
+ osafassert(cb->ha_state == SA_AMF_HA_ACTIVE); /* asserts that this IMMD is already in the active HA state */
+
+ IMMD_IMMND_DETACHED_NODE *detached_node = cb->detached_nodes;
+
+ while (detached_node) {
+ if(!cb->immd_remote_up && detached_node->epoch ==
cb->mRulingEpoch) { /* send only when immd_remote_up is false and the record epoch equals the ruling epoch */
+ LOG_NO("Old active NOT present => send discard node
payload %x",
+ detached_node->node_id);
+
+ memset(&send_evt, 0, sizeof(IMMSV_EVT)); /* start from a zeroed event, then fill in the discard-node fields */
+ send_evt.type = IMMSV_EVT_TYPE_IMMND;
+ send_evt.info.immnd.type = IMMND_EVT_D2ND_DISCARD_NODE;
+ send_evt.info.immnd.info.ctrl.nodeId =
detached_node->node_id;
+ send_evt.info.immnd.info.ctrl.ndExecPid =
detached_node->immnd_execPid;
+
+ osafassert(ncs_enc_init_space(&uba) ==
NCSCC_RC_SUCCESS);
+ osafassert(immsv_evt_enc(&send_evt, &uba) ==
NCSCC_RC_SUCCESS); /* encode the discard-node event into uba */
+
+ int32_t size = uba.ttl;
+ tmpData = malloc(size); /* scratch buffer passed to m_MMGR_DATA_AT_START below */
+ osafassert(tmpData);
+ char *data = m_MMGR_DATA_AT_START(uba.start, size,
tmpData);
+
+ memset(&send_evt, 0, sizeof(IMMSV_EVT)); /* reuse send_evt as the fevs request carrying the encoded bytes */
+ send_evt.type = IMMSV_EVT_TYPE_IMMD;
+ send_evt.info.immd.type = 0;
+ send_evt.info.immd.info.fevsReq.msg.size = size;
+ send_evt.info.immd.info.fevsReq.msg.buf = data;
+
+ if (immd_evt_proc_fevs_req(cb, &(send_evt.info.immd),
NULL, false)
+ != NCSCC_RC_SUCCESS) {
+ LOG_ER("Failed to send discard node message
over FEVS");
+ } /* a failed send is only logged; the loop continues */
+
+ free(tmpData);
+ }
+
+ LOG_IN("Removing pending discard for node:%x epoch:%u",
+ detached_node->node_id, detached_node->epoch);
+ cb->detached_nodes = detached_node->next; /* unlink the head record from the list */
+ detached_node->next = NULL;
+ free(detached_node);
+ detached_node = cb->detached_nodes; /* continue with the new head until the list is empty */
+ }
+} /* NOTE(review): no TRACE_LEAVE() in this function pairs with the TRACE_ENTER() above - confirm whether one is needed */
+
/****************************************************************************
* Name : immd_cb_dump
*
diff --git a/osaf/services/saf/immsv/immd/immd_proc.h
b/osaf/services/saf/immsv/immd/immd_proc.h
--- a/osaf/services/saf/immsv/immd/immd_proc.h
+++ b/osaf/services/saf/immsv/immd/immd_proc.h
@@ -33,6 +33,8 @@ void immd_proc_rebroadcast_fevs(IMMD_CB
uint32_t immd_process_immnd_down(IMMD_CB *cb, IMMD_IMMND_INFO_NODE *node, bool
active);
+void immd_pending_payload_discards(IMMD_CB *cb);
+
void immd_cb_dump(void);
uint32_t immd_mbcsv_chgrole(IMMD_CB *cb);
diff --git a/osaf/services/saf/immsv/immd/immd_sbevt.c
b/osaf/services/saf/immsv/immd/immd_sbevt.c
--- a/osaf/services/saf/immsv/immd/immd_sbevt.c
+++ b/osaf/services/saf/immsv/immd/immd_sbevt.c
@@ -182,6 +182,30 @@ uint32_t immd_process_node_accept(IMMD_C
LOG_IN("Corrected execPid for immnd node info");
}
}
+
+ if(!(ctrl->canBeCoord)) { /* payload node */
+ /* Remove the node-id from the list of detached
payloads. */
+ IMMD_IMMND_DETACHED_NODE *detached_node =
cb->detached_nodes;
+ IMMD_IMMND_DETACHED_NODE **prev = &(cb->detached_nodes);
+ while(detached_node) {
+ if(detached_node->node_id == ctrl->nodeId) {
+ *prev = detached_node->next;
+ detached_node->next = NULL;
+ break;/* out of while */
+ }
+
+ /* next iteration */
+ prev = &(detached_node->next);
+ detached_node = detached_node->next;
+ }
+
+ if(detached_node) {
+ free(detached_node);
+ LOG_IN("SBY: Received node-accept from active
IMMD for "
+ "payload node %x, discarding pending
removal record.",
+ ctrl->nodeId);
+ }
+ }
} else {
LOG_IN("Standby IMMD could not find node with nodeId:%x",
ctrl->nodeId);
}
------------------------------------------------------------------------------
How ServiceNow helps IT people transform IT departments:
1. Consolidate legacy IT systems to a single system of record for IT
2. Standardize and globalize service processes across IT
3. Implement zero-touch automation to replace manual, redundant tasks
http://pubads.g.doubleclick.net/gampad/clk?id=51271111&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel