Hi Hung,

Reviewed and tested the patch.
Ack.

/Neel.

On Tuesday 08 March 2016 10:53 AM, Hung Nguyen wrote:
>   osaf/services/saf/immsv/immnd/immnd_evt.c  |  37 
> ++++++++++++++++++++---------
>   osaf/services/saf/immsv/immnd/immnd_proc.c |  34 +++++++++++++++++++++++++--
>   2 files changed, 56 insertions(+), 15 deletions(-)
>
>
> Set 'mIsCoord' to false when headless to prevent the coordinator from restarting.
> Also handle the cases where the headless state occurs before or during sync.
>
> diff --git a/osaf/services/saf/immsv/immnd/immnd_evt.c 
> b/osaf/services/saf/immsv/immnd/immnd_evt.c
> --- a/osaf/services/saf/immsv/immnd/immnd_evt.c
> +++ b/osaf/services/saf/immsv/immnd/immnd_evt.c
> @@ -10194,24 +10194,38 @@ static uint32_t immnd_evt_proc_mds_evt(I
>                       }
>                       exit(1);
>               } else { /* SC ABSENCE ALLOWED */
> +                     cb->mIntroduced = 2;
>                       LOG_WA("SC Absence IS allowed:%u IMMD service is DOWN", 
> cb->mScAbsenceAllowed);
>                       if(cb->mIsCoord) {
> -                             /* Note that normally the coord will reside at 
> SCs so this branch will
> -                                only be relevant if REPEATED toal scAbsence 
> occurs. After SC absence
> -                                and subsequent return of SC, the coord will 
> be elected at a payload.
> -                                That coord will be active untill restart of 
> that payload..
> -                                unless we add functionality for the payload 
> coord to restart after
> -                                a few minutes .. ?
> -                             */
> -                             LOG_WA("This IMMND coord has to exit allowing 
> restarted IMMD to select new coord");
> -                             if(cb->mState < IMM_SERVER_SYNC_SERVER) {
> -                                     immnd_ackToNid(NCSCC_RC_FAILURE);
> +                             cb->mIsCoord = false;
> +
> +                             if (cb->mSyncRequested) {
> +                                     /* Just got sync requested from IMMD, 
> nothing happened yet */
> +                                     cb->mSyncRequested = false;
> +
> +                             } else if (cb->mState == IMM_SERVER_SYNC_SERVER 
> && cb->mPendSync) {
> +                                     /* Sent out sync-start msg but sync 
> didn't start yet, revert the state to IMM_SERVER_READY */
> +                                     cb->mPendSync = false;
> +                                     cb->mState = IMM_SERVER_READY;
> +                                     LOG_NO("SERVER STATE: 
> IMM_SERVER_SYNC_SERVER --> IMM_SERVER_READY");
> +
> +                             } else if (cb->mState == IMM_SERVER_SYNC_SERVER 
> && (cb->syncPid > 0)) {
> +                                     /* Sync started, kill sync process to 
> trigger sync abort in immnd_proc_server() */
> +                                     osafassert(!cb->mPendSync);
> +                                     kill(cb->syncPid, SIGTERM);
>                               }
> -                             exit(1);
> +
>                       } else if(cb->mState <= IMM_SERVER_LOADING_PENDING) {
>                               /* Reset state in payloads that had not joined. 
> No need to restart. */
>                               LOG_IN("Resetting IMMND state from %u to 
> IMM_SERVER_ANONYMOUS", cb->mState);
>                               cb->mState = IMM_SERVER_ANONYMOUS;
> +
> +                     } else if (cb->mState == IMM_SERVER_READY && 
> immModel_immNotWritable(cb)) {
> +                             /* SC absence allowed case: IMMD is down while a
> +                              * sync is in progress. Veteran nodes other than
> +                              * the syncing node have to change the node state
> +                              * from NODE_R_AVAILABLE to NODE_FULLY_AVAILABLE. */
> +                             immnd_abortSync(cb);
> +
>                       } else if(cb->mState < IMM_SERVER_READY) {
>                               LOG_WA("IMMND was being synced or loaded (%u), 
> has to restart", cb->mState);
>                               if(cb->mState < IMM_SERVER_SYNC_SERVER) {
> @@ -10220,7 +10234,6 @@ static uint32_t immnd_evt_proc_mds_evt(I
>                               exit(1);
>                       }
>               }
> -             cb->mIntroduced = 2;
>               LOG_NO("IMMD SERVICE IS DOWN, HYDRA IS CONFIGURED => 
> UNREGISTERING IMMND form MDS");
>               immnd_mds_unregister(cb);
>               /* Discard local clients ...  */
> diff --git a/osaf/services/saf/immsv/immnd/immnd_proc.c 
> b/osaf/services/saf/immsv/immnd/immnd_proc.c
> --- a/osaf/services/saf/immsv/immnd/immnd_proc.c
> +++ b/osaf/services/saf/immsv/immnd/immnd_proc.c
> @@ -872,7 +872,7 @@ void immnd_abortSync(IMMND_CB *cb)
>       memset(&send_evt, '\0', sizeof(IMMSV_EVT));
>       TRACE_ENTER();
>       TRACE("ME:%u RE:%u", cb->mMyEpoch, cb->mRulingEpoch);
> -     osafassert(cb->mIsCoord);
> +     osafassert(cb->mIsCoord || (cb->mScAbsenceAllowed && cb->mIntroduced == 
> 2 ));
>       cb->mPendSync = 0;
>       if(cb->mSyncFinalizing) {
>               cb->mSyncFinalizing = 0x0;
> @@ -898,6 +898,12 @@ void immnd_abortSync(IMMND_CB *cb)
>               LOG_ER("immnd_abortSync not clean on epoch: RE:%u ME:%u", 
> cb->mRulingEpoch, cb->mMyEpoch);
>       }
>   
> +     /* Skip broadcasting sync abort msg when SC are absent */
> +     if (cb->mScAbsenceAllowed && cb->mIntroduced == 2) {
> +             TRACE_LEAVE();
> +             return;
> +     }
> +
>       while (!immnd_is_immd_up(cb) && (retryCount++ < 20)) {
>               LOG_WA("Coord blocked in sending ABORT_SYNC because IMMD is 
> DOWN %u", retryCount);
>               sleep(1);
> @@ -1319,6 +1325,10 @@ void immnd_proc_global_abort_ccb(IMMND_C
>   
>   static SaBoolT immnd_ccbsTerminated(IMMND_CB *cb, SaUint32T duration, 
> SaBoolT* pbeImmndDeadlock)
>   {
> +     if (cb->mIntroduced == 2) {
> +             /* Return true to enter phase 2 or phase 3 of SYNC_SERVER */
> +             return SA_TRUE;
> +     }
>       osafassert(cb->mIsCoord);
>       osafassert(pbeImmndDeadlock);
>       (*pbeImmndDeadlock) = SA_FALSE;
> @@ -1999,9 +2009,14 @@ uint32_t immnd_proc_server(uint32_t *tim
>                       /*Phase 2 */
>                       if (cb->syncPid <= 0) {
>                               /*Fork sync-agent */
> -                             cb->syncPid = immnd_forkSync(cb);
> +                             /* When SC are absent, we don't fork to trigger 
> abortSync */
> +                             if (cb->mIntroduced != 2) {
> +                                     cb->syncPid = immnd_forkSync(cb);
> +                             }
>                               if (cb->syncPid <= 0) {
> -                                     LOG_ER("Failed to fork sync process");
> +                                     if (cb->mIntroduced != 2) {
> +                                             LOG_ER("Failed to fork sync 
> process");
> +                                     }
>                                       cb->syncPid = 0;
>                                       cb->mStep = 0;
>                                       cb->mJobStart = now;
> @@ -2063,6 +2078,19 @@ uint32_t immnd_proc_server(uint32_t *tim
>   
>               if(cb->mIntroduced == 2) {
>                       immnd_introduceMe(cb);
> +                     if(cb->pbePid > 0) {
> +                             /* Check if the PBE process has terminated.
> +                              * Send SIGKILL if it has not. */
> +                             int status = 0;
> +                             if (waitpid(cb->pbePid, &status, WNOHANG) > 0) {
> +                                     /* Child reaped, forget its pid. */
> +                                     cb->pbePid = 0;
> +                                     LOG_NO("PBE has terminated due to SC absence");
> +                             } else {
> +                                     /* Do NOT clear pbePid here: kill(0, ...) would
> +                                      * signal every process in our own process group,
> +                                      * and the pid is still needed so that a subsequent
> +                                      * pass through this branch can reap the child
> +                                      * with waitpid(). */
> +                                     LOG_WA("SC were absent and PBE appears hung, sending SIGKILL");
> +                                     kill(cb->pbePid, SIGKILL);
> +                             }
> +                     }
>                       break;
>               }
>   


------------------------------------------------------------------------------
Transform Data into Opportunity.
Accelerate data analysis in your applications with
Intel Data Analytics Acceleration Library.
Click to learn more.
http://pubads.g.doubleclick.net/gampad/clk?id=278785351&iu=/4140
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to