Nice work, Hung,

Ack from me with one question inline.


-----Original Message-----
From: Hung Nguyen [mailto:[email protected]] 
Sent: Tuesday, March 08, 2016 6:23 AM
To: Zoran Milinkovic; [email protected]
Cc: [email protected]
Subject: [PATCH 1 of 1] imm: Remove coordinator role when SC absence happens [#1692]

 osaf/services/saf/immsv/immnd/immnd_evt.c  |  37 ++++++++++++++++++++---------
 osaf/services/saf/immsv/immnd/immnd_proc.c |  34 +++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 15 deletions(-)


Set 'mIsCoord' to false when headless to prevent the coordinator from restarting.
Also handle the cases when headless occurs before/during sync.

diff --git a/osaf/services/saf/immsv/immnd/immnd_evt.c 
b/osaf/services/saf/immsv/immnd/immnd_evt.c
--- a/osaf/services/saf/immsv/immnd/immnd_evt.c
+++ b/osaf/services/saf/immsv/immnd/immnd_evt.c
@@ -10194,24 +10194,38 @@ static uint32_t immnd_evt_proc_mds_evt(I
                        }
                        exit(1);
                } else { /* SC ABSENCE ALLOWED */
+                       cb->mIntroduced = 2;
                        LOG_WA("SC Absence IS allowed:%u IMMD service is DOWN", 
cb->mScAbsenceAllowed);
                        if(cb->mIsCoord) {
-                               /* Note that normally the coord will reside at 
SCs so this branch will
-                                  only be relevant if REPEATED toal scAbsence 
occurs. After SC absence
-                                  and subsequent return of SC, the coord will 
be elected at a payload.
-                                  That coord will be active untill restart of 
that payload..
-                                  unless we add functionality for the payload 
coord to restart after
-                                  a few minutes .. ?
-                               */
-                               LOG_WA("This IMMND coord has to exit allowing 
restarted IMMD to select new coord");
-                               if(cb->mState < IMM_SERVER_SYNC_SERVER) {
-                                       immnd_ackToNid(NCSCC_RC_FAILURE);
+                               cb->mIsCoord = false;
+
+                               if (cb->mSyncRequested) {
+                                       /* Just got sync requested from IMMD, 
nothing happened yet */
+                                       cb->mSyncRequested = false;
+
+                               } else if (cb->mState == IMM_SERVER_SYNC_SERVER 
&& cb->mPendSync) {
+                                       /* Sent out sync-start msg but sync 
didn't start yet, revert the state to IMM_SERVER_READY */
+                                       cb->mPendSync = false;
+                                       cb->mState = IMM_SERVER_READY;
+                                       LOG_NO("SERVER STATE: 
IMM_SERVER_SYNC_SERVER --> IMM_SERVER_READY");
+
+                               } else if (cb->mState == IMM_SERVER_SYNC_SERVER 
&& (cb->syncPid > 0)) {
+                                       /* Sync started, kill sync process to 
trigger sync abort in immnd_proc_server() */
+                                       osafassert(!cb->mPendSync);
+                                       kill(cb->syncPid, SIGTERM);
                                }

[Zoran] Shouldn't the other cb->mState states be handled here? For example, 
IMM_SERVER_LOADING_SERVER.
I think the last 'else' should be the same as the replaced 'if' statement 
(cb->mState < IMM_SERVER_SYNC_SERVER).

Thanks,
Zoran

-                               exit(1);
+
                        } else if(cb->mState <= IMM_SERVER_LOADING_PENDING) {
                                /* Reset state in payloads that had not joined. 
No need to restart. */
                                LOG_IN("Resetting IMMND state from %u to 
IMM_SERVER_ANONYMOUS", cb->mState);
                                cb->mState = IMM_SERVER_ANONYMOUS;
+
+                       } else if (cb->mState == IMM_SERVER_READY && 
immModel_immNotWritable(cb)) {
+                               /* This SC absence allowed case, when IMMD is 
down and
+                                The sync is in progress. Veteran nodes Other 
than the syncing node,
+                                has to change the node state from 
NODE_R_AVAILABLE to NODE_FULLY_AVAILABLE*/
+                               immnd_abortSync(cb);
+
                        } else if(cb->mState < IMM_SERVER_READY) {
                                LOG_WA("IMMND was being synced or loaded (%u), 
has to restart", cb->mState);
                                if(cb->mState < IMM_SERVER_SYNC_SERVER) {
@@ -10220,7 +10234,6 @@ static uint32_t immnd_evt_proc_mds_evt(I
                                exit(1);
                        }
                }
-               cb->mIntroduced = 2;
                LOG_NO("IMMD SERVICE IS DOWN, HYDRA IS CONFIGURED => 
UNREGISTERING IMMND form MDS");
                immnd_mds_unregister(cb);
                /* Discard local clients ...  */
diff --git a/osaf/services/saf/immsv/immnd/immnd_proc.c 
b/osaf/services/saf/immsv/immnd/immnd_proc.c
--- a/osaf/services/saf/immsv/immnd/immnd_proc.c
+++ b/osaf/services/saf/immsv/immnd/immnd_proc.c
@@ -872,7 +872,7 @@ void immnd_abortSync(IMMND_CB *cb)
        memset(&send_evt, '\0', sizeof(IMMSV_EVT));
        TRACE_ENTER();
        TRACE("ME:%u RE:%u", cb->mMyEpoch, cb->mRulingEpoch);
-       osafassert(cb->mIsCoord);
+       osafassert(cb->mIsCoord || (cb->mScAbsenceAllowed && cb->mIntroduced == 
2 ));
        cb->mPendSync = 0;
        if(cb->mSyncFinalizing) {
                cb->mSyncFinalizing = 0x0;
@@ -898,6 +898,12 @@ void immnd_abortSync(IMMND_CB *cb)
                LOG_ER("immnd_abortSync not clean on epoch: RE:%u ME:%u", 
cb->mRulingEpoch, cb->mMyEpoch);
        }
 
+       /* Skip broadcasting sync abort msg when SC are absent */
+       if (cb->mScAbsenceAllowed && cb->mIntroduced == 2) {
+               TRACE_LEAVE();
+               return;
+       }
+
        while (!immnd_is_immd_up(cb) && (retryCount++ < 20)) {
                LOG_WA("Coord blocked in sending ABORT_SYNC because IMMD is 
DOWN %u", retryCount);
                sleep(1);
@@ -1319,6 +1325,10 @@ void immnd_proc_global_abort_ccb(IMMND_C
 
 static SaBoolT immnd_ccbsTerminated(IMMND_CB *cb, SaUint32T duration, SaBoolT* 
pbeImmndDeadlock)
 {
+       if (cb->mIntroduced == 2) {
+               /* Return true to enter phase 2 or phase 3 of SYNC_SERVER */
+               return SA_TRUE;
+       }
        osafassert(cb->mIsCoord);
        osafassert(pbeImmndDeadlock);
        (*pbeImmndDeadlock) = SA_FALSE;
@@ -1999,9 +2009,14 @@ uint32_t immnd_proc_server(uint32_t *tim
                        /*Phase 2 */
                        if (cb->syncPid <= 0) {
                                /*Fork sync-agent */
-                               cb->syncPid = immnd_forkSync(cb);
+                               /* When SC are absent, we don't fork to trigger 
abortSync */
+                               if (cb->mIntroduced != 2) {
+                                       cb->syncPid = immnd_forkSync(cb);
+                               }
                                if (cb->syncPid <= 0) {
-                                       LOG_ER("Failed to fork sync process");
+                                       if (cb->mIntroduced != 2) {
+                                               LOG_ER("Failed to fork sync 
process");
+                                       }
                                        cb->syncPid = 0;
                                        cb->mStep = 0;
                                        cb->mJobStart = now;
@@ -2063,6 +2078,19 @@ uint32_t immnd_proc_server(uint32_t *tim
 
                if(cb->mIntroduced == 2) {
                        immnd_introduceMe(cb);
+                       if(cb->pbePid > 0) {
+                               /* Check if pbe process is terminated.
+                                * Will send SIGKILL if it's not terminated. */
+                               int status = 0;
+                               if (waitpid(cb->pbePid, &status, WNOHANG) > 0) {
+                                       cb->pbePid = 0;
+                                       LOG_NO("PBE has terminated due to SC 
absence");
+                               } else {
+                                       cb->pbePid = 0;
+                                       LOG_WA("SC were absent and PBE appears 
hung, sending SIGKILL");
+                                       kill(cb->pbePid, SIGKILL);
+                               }
+                       }
                        break;
                }
 

------------------------------------------------------------------------------
Transform Data into Opportunity.
Accelerate data analysis in your applications with
Intel Data Analytics Acceleration Library.
Click to learn more.
http://pubads.g.doubleclick.net/gampad/clk?id=278785351&iu=/4140
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to