osaf/services/saf/immsv/immd/immd_cb.h    |   10 ++-
 osaf/services/saf/immsv/immd/immd_evt.c   |    1 +
 osaf/services/saf/immsv/immd/immd_proc.c  |  104 +++++++++++++++++++++++++++++-
 osaf/services/saf/immsv/immd/immd_proc.h  |    2 +
 osaf/services/saf/immsv/immd/immd_sbevt.c |   24 ++++++
 5 files changed, 138 insertions(+), 3 deletions(-)


The standby IMMD records IMMND down events for payload nodes, including
the epoch in which each event occurred. A recorded payload down event is
discarded by the standby IMMD when a new IMMND with the same node-id is
introduced. At failover, the new active IMMD generates a discard-node
event for every recorded payload down event that is still in the current
epoch. Normally these discard-node events are redundant, but in rare
cases they plug the hole that this ticket reported.

diff --git a/osaf/services/saf/immsv/immd/immd_cb.h 
b/osaf/services/saf/immsv/immd/immd_cb.h
--- a/osaf/services/saf/immsv/immd/immd_cb.h
+++ b/osaf/services/saf/immsv/immd/immd_cb.h
@@ -53,13 +53,20 @@ typedef struct immd_immnd_info_node {
 
        /*ABT below corresponds to old ImmEvs::NodeInfo */
        int immnd_execPid;
-       int epoch;
+       unsigned int epoch;
        bool syncRequested;
        bool isOnController;
        bool isCoord;
        bool syncStarted;
 } IMMD_IMMND_INFO_NODE;
 
+typedef struct immd_immnd_detached_node { /* IMMD SBY tracking of departed 
payload */
+       NODE_ID node_id; /* Copied from the departed IMMND's immnd_key */
+       int immnd_execPid; /* immnd_execPid the IMMND had when it went down */
+       unsigned int epoch; /* Epoch recorded for the IMMND at the down event */
+       struct immd_immnd_detached_node *next; /* Next record; head is IMMD_CB.detached_nodes */
+} IMMD_IMMND_DETACHED_NODE;
+
 typedef struct immd_cb_tag {
        SYSF_MBX mbx;
        SaNameT comp_name;
@@ -117,6 +124,7 @@ typedef struct immd_cb_tag {
        IMMD_SAVED_FEVS_MSG *saved_msgs;
 
        SaImmRepositoryInitModeT mRim; /* Should be the rim obtained from 
coord. */
+       IMMD_IMMND_DETACHED_NODE *detached_nodes; /* IMMD SBY list of recently 
departed payloads */
 } IMMD_CB;
 
 uint32_t immd_immnd_info_tree_init(IMMD_CB *cb);
diff --git a/osaf/services/saf/immsv/immd/immd_evt.c 
b/osaf/services/saf/immsv/immd/immd_evt.c
--- a/osaf/services/saf/immsv/immd/immd_evt.c
+++ b/osaf/services/saf/immsv/immd/immd_evt.c
@@ -1978,6 +1978,7 @@ static uint32_t immd_evt_proc_lga_callba
                /* Change of role to active => We may need to elect new coord */
                immd_proc_elect_coord(cb, true);
                immd_db_purge_fevs(cb);
+               immd_pending_payload_discards(cb); /*Ensure node down for 
payloads.*/
        }
 done:
        TRACE_LEAVE();
diff --git a/osaf/services/saf/immsv/immd/immd_proc.c 
b/osaf/services/saf/immsv/immd/immd_proc.c
--- a/osaf/services/saf/immsv/immd/immd_proc.c
+++ b/osaf/services/saf/immsv/immd/immd_proc.c
@@ -439,17 +439,117 @@ uint32_t immd_process_immnd_down(IMMD_CB
 
                        free(tmpData);
                }
+       } else if(!(immnd_info->isOnController)) {
+               /* Standby NOT immediately sending redundant D2ND_DISCARD_NODE 
in this case.
+                  But will record any payload down event in case the active SC 
is included
+                  in a burst of node downs. See ticket #563. The active IMMD 
may be going
+                  down together with many payload nodes, such that the active 
IMMD never has
+                  time to generate the discard node message for all payloads. 
This will be
+                  detected if this (standby) IMMD becomes active in close time 
proximity.
+                  See immd_pending_payload_discards() below.
+                */
+               
+               LOG_IN("Standby IMMD recording IMMND DOWN for node %x", 
immnd_info->immnd_key);
+               IMMD_IMMND_DETACHED_NODE *detached_node = calloc(1, 
sizeof(IMMD_IMMND_DETACHED_NODE));
+               osafassert(detached_node);
+               detached_node->node_id = immnd_info->immnd_key;
+               detached_node->immnd_execPid = immnd_info->immnd_execPid;
+               detached_node->epoch = immnd_info->epoch;
+               detached_node->next = cb->detached_nodes;
+               cb->detached_nodes = detached_node;
        }
 
        /*We remove the node for the lost IMMND on both active and standby. */
-       TRACE_5("Removing node key:%u dest:%u", immnd_info->immnd_key,
-               m_NCS_NODE_ID_FROM_MDS_DEST(immnd_info->immnd_dest));
+       TRACE_5("Removing node id:%x", immnd_info->immnd_key);
        immd_immnd_info_node_delete(cb, immnd_info);
        immd_cb_dump();
        TRACE_LEAVE();
        return NCSCC_RC_SUCCESS;
 }
 
+
+/****************************************************************************
+ * Name          : immd_pending_payload_discards
+ *
+ * Description   : Send possibly redundant discard-node message to IMMNDs for
+ *                 payload nodes (IMMNDs) that have departed and not returned.
+ *                 This is needed to plug the small hole that exists in the
+ *                 handling of IMMND node down, when the current active IMMD
+ *                 is being taken down concurrently with several payloads.
+ *
+ *                 The active IMMD may then be pulled down after having 
+ *                 received the IMMND MDS down event for the payloads, but
+ *                 before having created or sent the fevs message broadcasting
+ *                 each node down to the IMMND cluster.
+ *
+ *                 The list of detached nodes is screened for having occurred
+ *                 in the current epoch. This is an extra guard against the 
new 
+ *                 active shooting down a recently restarted payload. The list
+ *                 is also pruned in immd_sbevt.c when it receives info about
+ *                 a payload having re-joined.
+ *
+ *                 This function should only be invoked by the just recently
+ *                 newly active IMMD. It is only relevant for fail-over, not
+ *                 for switch-over (si-swap) since for a switch-over the old
+ *                 active would never drop sending node-down messages. 
+ *
+ * Return Values : -
+ *
+ * Notes         : NOTE(review): TRACE_ENTER() has no matching TRACE_LEAVE().
+ *****************************************************************************/
+void immd_pending_payload_discards(IMMD_CB *cb)
+{
+       IMMSV_EVT send_evt;
+       char *tmpData = NULL;
+       NCS_UBAID uba;
+       TRACE_ENTER();
+
+       osafassert(cb->ha_state == SA_AMF_HA_ACTIVE);
+
+       IMMD_IMMND_DETACHED_NODE *detached_node = cb->detached_nodes;
+
+       while (detached_node) {
+               if(!cb->immd_remote_up && detached_node->epoch == 
cb->mRulingEpoch) {
+                       LOG_NO("Old active NOT present => send discard node 
payload %x", 
+                               detached_node->node_id);
+
+                       memset(&send_evt, 0, sizeof(IMMSV_EVT));
+                       send_evt.type = IMMSV_EVT_TYPE_IMMND;
+                       send_evt.info.immnd.type = IMMND_EVT_D2ND_DISCARD_NODE;
+                       send_evt.info.immnd.info.ctrl.nodeId = 
detached_node->node_id;
+                       send_evt.info.immnd.info.ctrl.ndExecPid = 
detached_node->immnd_execPid;
+
+                       osafassert(ncs_enc_init_space(&uba) == 
NCSCC_RC_SUCCESS);
+                       osafassert(immsv_evt_enc(&send_evt, &uba) == 
NCSCC_RC_SUCCESS);
+
+                       int32_t size = uba.ttl;
+                       tmpData = malloc(size);
+                       osafassert(tmpData);
+                       char *data = m_MMGR_DATA_AT_START(uba.start, size, 
tmpData);
+                       /* Reuse send_evt as the fevs request carrying the encoded message. */
+                       memset(&send_evt, 0, sizeof(IMMSV_EVT));
+                       send_evt.type = IMMSV_EVT_TYPE_IMMD;
+                       send_evt.info.immd.type = 0;
+                       send_evt.info.immd.info.fevsReq.msg.size = size;
+                       send_evt.info.immd.info.fevsReq.msg.buf = data;
+
+                       if (immd_evt_proc_fevs_req(cb, &(send_evt.info.immd), 
NULL, false)
+                           != NCSCC_RC_SUCCESS) {
+                               LOG_ER("Failed to send discard node message 
over FEVS");
+                       }
+
+                       free(tmpData);
+               }
+               /* Every record is unlinked and freed, whether or not a discard was sent. */
+               LOG_IN("Removing pending discard for node:%x epoch:%u", 
+                       detached_node->node_id, detached_node->epoch);
+               cb->detached_nodes = detached_node->next;
+               detached_node->next = NULL;
+               free(detached_node);
+               detached_node = cb->detached_nodes;
+       }
+}
+
 /****************************************************************************
  * Name          : immd_cb_dump
  *
diff --git a/osaf/services/saf/immsv/immd/immd_proc.h 
b/osaf/services/saf/immsv/immd/immd_proc.h
--- a/osaf/services/saf/immsv/immd/immd_proc.h
+++ b/osaf/services/saf/immsv/immd/immd_proc.h
@@ -33,6 +33,8 @@ void immd_proc_rebroadcast_fevs(IMMD_CB 
 
 uint32_t immd_process_immnd_down(IMMD_CB *cb, IMMD_IMMND_INFO_NODE *node, bool 
active);
 
+void immd_pending_payload_discards(IMMD_CB *cb);
+
 void immd_cb_dump(void);
 
 uint32_t immd_mbcsv_chgrole(IMMD_CB *cb);
diff --git a/osaf/services/saf/immsv/immd/immd_sbevt.c 
b/osaf/services/saf/immsv/immd/immd_sbevt.c
--- a/osaf/services/saf/immsv/immd/immd_sbevt.c
+++ b/osaf/services/saf/immsv/immd/immd_sbevt.c
@@ -182,6 +182,30 @@ uint32_t immd_process_node_accept(IMMD_C
                                LOG_IN("Corrected execPid for immnd node info");
                        }
                }
+
+               if(!(ctrl->canBeCoord)) { /* payload node */
+                       /* Remove the node-id from the list of detached 
payloads. */
+                       IMMD_IMMND_DETACHED_NODE *detached_node = 
cb->detached_nodes;
+                       IMMD_IMMND_DETACHED_NODE **prev = &(cb->detached_nodes);
+                       while(detached_node) {
+                               if(detached_node->node_id == ctrl->nodeId) {
+                                       *prev = detached_node->next;
+                                       detached_node->next = NULL;
+                                       break;/* out of while */
+                               }
+
+                               /* next iteration */
+                               prev = &(detached_node->next);
+                               detached_node = detached_node->next;
+                       }
+
+                       if(detached_node) {
+                               free(detached_node);
+                               LOG_IN("SBY: Received node-accept from active 
IMMD for "
+                                       "payload node %x, discarding pending 
removal record.",
+                                       ctrl->nodeId);
+                       }
+               }
        } else {
                LOG_IN("Standby IMMD could not find node with nodeId:%x", 
ctrl->nodeId);
        }

------------------------------------------------------------------------------
How ServiceNow helps IT people transform IT departments:
1. Consolidate legacy IT systems to a single system of record for IT
2. Standardize and globalize service processes across IT
3. Implement zero-touch automation to replace manual, redundant tasks
http://pubads.g.doubleclick.net/gampad/clk?id=51271111&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to