- Lost nodes are counted incorrectly, which causes the new coordinator to
postpone sync while waiting for a number of nodes larger than the cluster size.
- Count lost nodes correctly by tracking them in a set of lost node IDs.
---
 src/imm/immnd/ImmModel.cc  | 14 ++++++++++++++
 src/imm/immnd/immnd_evt.c  |  4 ++--
 src/imm/immnd/immnd_init.h |  4 ++++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/imm/immnd/ImmModel.cc b/src/imm/immnd/ImmModel.cc
index 631597b8a..00d7f4794 100644
--- a/src/imm/immnd/ImmModel.cc
+++ b/src/imm/immnd/ImmModel.cc
@@ -524,6 +524,7 @@ typedef std::map<std::string, ObjectSet> MissingParentsMap;
 
 // Local variables
 
+static std::set<SaUint32T> sDiscardNodeSet;
 static ClassMap sClassMap;
 static AdminOwnerVector sOwnerVector;
 static CcbVector sCcbVector;
@@ -1364,12 +1365,25 @@ void immModel_getCcbIdsForOrigCon(IMMND_CB* cb, SaUint32T deadCon,
   osafassert(ix == (*arrSize));
 }
 
+void immModel_resetDiscardNodes(IMMND_CB* cb) {
+  cb->mLostNodes = 0;       // restart the lost-node count from zero
+  sDiscardNodeSet.clear();  // and forget every node id recorded as lost
+}
+
+void immModel_eraseDiscardNode(SaUint32T nodeId) {
+  sDiscardNodeSet.erase(nodeId);  // no-op when nodeId is not in the set
+}
+
 void immModel_discardNode(IMMND_CB* cb, SaUint32T nodeId, SaUint32T* arrSize,
                           SaUint32T** ccbIdArr, SaUint32T* globArrSize,
                           SaUint32T** globccbIdArr) {
   ConnVector cv, gv;
   ConnVector::iterator cvi, gvi;
   unsigned int ix = 0;
+  if (sDiscardNodeSet.find(nodeId) == sDiscardNodeSet.end()) {
+    sDiscardNodeSet.insert(nodeId);
+    cb->mLostNodes++;
+  }
   ImmModel::instance(&cb->immModel)
       ->discardNode(nodeId, cv, gv, cb->mIsCoord, false);
   *arrSize = (SaUint32T)cv.size();
diff --git a/src/imm/immnd/immnd_evt.c b/src/imm/immnd/immnd_evt.c
index dfef6c0a5..af8f5876a 100644
--- a/src/imm/immnd/immnd_evt.c
+++ b/src/imm/immnd/immnd_evt.c
@@ -10321,7 +10321,7 @@ static uint32_t immnd_evt_proc_start_sync(IMMND_CB *cb, IMMND_EVT *evt,
                                   Nodes. This is mostly relevant for "standby"
                                   i.e. the non-coord immnd which is on an SC.
                                 */
-                               cb->mLostNodes = 0;
+                               immModel_resetDiscardNodes(cb);
                        }
                }
                immModel_prepareForSync(cb, cb->mSync);
@@ -10488,6 +10488,7 @@ static uint32_t immnd_evt_proc_sync_req(IMMND_CB *cb, IMMND_EVT *evt,
                cb->mSyncRequested = true;
                if (cb->mLostNodes > 0) {
                        cb->mLostNodes--;
+                       immModel_eraseDiscardNode(evt->info.ctrl.nodeId);
                }
                /*osafassert(cb->mRulingEpoch == evt->info.ctrl.rulingEpoch); */
                TRACE_2("At COORD: My Ruling Epoch:%u Cenral Ruling Epoch:%u",
@@ -10989,7 +10990,6 @@ static void immnd_evt_proc_discard_node(IMMND_CB *cb, IMMND_EVT *evt,
        /* We should remember the nodeId/pid pair to avoid a redundant message
           causing a newly reattached node being discarded.
         */
-       cb->mLostNodes++;
        immModel_discardNode(cb, evt->info.ctrl.nodeId, &arrSize, &idArr,
                             &globArrSize, &globIdArr);
        if (globArrSize) {
diff --git a/src/imm/immnd/immnd_init.h b/src/imm/immnd/immnd_init.h
index 9a3f70072..0732f43f0 100644
--- a/src/imm/immnd/immnd_init.h
+++ b/src/imm/immnd/immnd_init.h
@@ -154,6 +154,10 @@ bool immModel_ccbAbort(IMMND_CB *cb, SaUint32T ccbId, SaUint32T *arrSize,
 void immModel_getCcbIdsForOrigCon(IMMND_CB *cb, SaUint32T origConn,
                                   SaUint32T *arrSize, SaUint32T **ccbIdArr);
 
+void immModel_resetDiscardNodes(IMMND_CB* cb);
+
+void immModel_eraseDiscardNode(SaUint32T nodeId);
+
 void immModel_discardNode(IMMND_CB *cb, SaUint32T nodeId, SaUint32T *arrSize,
                           SaUint32T **ccbIdArr, SaUint32T *globArrSize,
                           SaUint32T **globccbIdArr);
-- 
2.25.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to