Re: [devel] [PATCH 1 of 1] imm: Remove coordinator role when SC absence happens [#1625]

Neelakanta Reddy Thu, 25 Feb 2016 21:30:27 -0800

Hi Hung,

Reviewed and tested the patch.
Following are the comments:


1. This patch has to go in a separate defect,After #1625 is pushed.
2. re-publish the patch with the comments when the defect is opened
3. Comments inline.

Thanks,
Neel.

On Thursday 25 February 2016 05:29 PM, Hung Nguyen wrote:
>   osaf/services/saf/immsv/immnd/immnd_evt.c  |  13 +------
>   osaf/services/saf/immsv/immnd/immnd_init.h |   1 +
>   osaf/services/saf/immsv/immnd/immnd_proc.c |  55 
> ++++++++++++++++++++++++++++-
>   3 files changed, 55 insertions(+), 14 deletions(-)
>
>
> Set 'mIsCoord' to false when headless to avoid coordinator from restarting.
> Also handle the cases when headless occurs before/during sync.
>
> diff --git a/osaf/services/saf/immsv/immnd/immnd_evt.c 
> b/osaf/services/saf/immsv/immnd/immnd_evt.c
> --- a/osaf/services/saf/immsv/immnd/immnd_evt.c
> +++ b/osaf/services/saf/immsv/immnd/immnd_evt.c
> @@ -10196,18 +10196,7 @@ static uint32_t immnd_evt_proc_mds_evt(I
>               } else { /* SC ABSENCE ALLOWED */
>                       LOG_WA("SC Absence IS allowed:%u IMMD service is DOWN", 
> cb->mScAbsenceAllowed);
>                       if(cb->mIsCoord) {
> -                             /* Note that normally the coord will reside at 
> SCs so this branch will
> -                                only be relevant if REPEATED toal scAbsence 
> occurs. After SC absence
> -                                and subsequent return of SC, the coord will 
> be elected at a payload.
> -                                That coord will be active untill restart of 
> that payload..
> -                                unless we add functionality for the payload 
> coord to restart after
> -                                a few minutes .. ?
> -                             */
> -                             LOG_WA("This IMMND coord has to exit allowing 
> restarted IMMD to select new coord");
> -                             if(cb->mState < IMM_SERVER_SYNC_SERVER) {
> -                                     immnd_ackToNid(NCSCC_RC_FAILURE);
> -                             }
> -                             exit(1);
> +                             immnd_proc_remove_coordinator_role(cb);
No, need of new function, add the code here.
>                       } else if(cb->mState <= IMM_SERVER_LOADING_PENDING) {
>                               /* Reset state in payloads that had not joined. 
> No need to restart. */
>                               LOG_IN("Resetting IMMND state from %u to 
> IMM_SERVER_ANONYMOUS", cb->mState);
> diff --git a/osaf/services/saf/immsv/immnd/immnd_init.h 
> b/osaf/services/saf/immsv/immnd/immnd_init.h
> --- a/osaf/services/saf/immsv/immnd/immnd_init.h
> +++ b/osaf/services/saf/immsv/immnd/immnd_init.h
> @@ -40,6 +40,7 @@ extern IMMND_CB *immnd_cb;
>   /* file : -  immnd_proc.c */
>   
>   void immnd_proc_discard_other_nodes(IMMND_CB *cb);
> +void immnd_proc_remove_coordinator_role(IMMND_CB *cb);
>   
>   void immnd_proc_imma_down(IMMND_CB *cb, MDS_DEST dest, NCSMDS_SVC_ID sv_id);
>   uint32_t immnd_proc_imma_discard_connection(IMMND_CB *cb, 
> IMMND_IMM_CLIENT_NODE *cl_node, bool scAbsenceAllowed);
> diff --git a/osaf/services/saf/immsv/immnd/immnd_proc.c 
> b/osaf/services/saf/immsv/immnd/immnd_proc.c
> --- a/osaf/services/saf/immsv/immnd/immnd_proc.c
> +++ b/osaf/services/saf/immsv/immnd/immnd_proc.c
> @@ -872,7 +872,7 @@ void immnd_abortSync(IMMND_CB *cb)
>       memset(&send_evt, '\0', sizeof(IMMSV_EVT));
>       TRACE_ENTER();
>       TRACE("ME:%u RE:%u", cb->mMyEpoch, cb->mRulingEpoch);
> -     osafassert(cb->mIsCoord);
> +     osafassert(cb->mIsCoord || cb->mIntroduced == 2);
osafassert(cb->mIsCoord || (cb->mScAbsenceAllowed && cb->mIntroduced == 2 );
>       cb->mPendSync = 0;
>       if(cb->mSyncFinalizing) {
>               cb->mSyncFinalizing = 0x0;
> @@ -898,6 +898,12 @@ void immnd_abortSync(IMMND_CB *cb)
>               LOG_ER("immnd_abortSync not clean on epoch: RE:%u ME:%u", 
> cb->mRulingEpoch, cb->mMyEpoch);
>       }
>   
> +     /* Skip broadcasting sync abort msg when SC are absent */
> +     if (cb->mIntroduced == 2) {
(cb->mScAbsenceAllowed && cb->mIntroduced == 2 )
> +             TRACE_LEAVE();
> +             return;
> +     }
> +
>       while (!immnd_is_immd_up(cb) && (retryCount++ < 20)) {
>               LOG_WA("Coord blocked in sending ABORT_SYNC because IMMD is 
> DOWN %u", retryCount);
>               sleep(1);
> @@ -1319,6 +1325,10 @@ void immnd_proc_global_abort_ccb(IMMND_C
>   
>   static SaBoolT immnd_ccbsTerminated(IMMND_CB *cb, SaUint32T duration, 
> SaBoolT* pbeImmndDeadlock)
>   {
> +     if (cb->mIntroduced == 2) {
> +             /* Return true to enter phase 2 or phase 3 of SYNC_SERVER */
> +             return SA_TRUE;
> +     }
>       osafassert(cb->mIsCoord);
>       osafassert(pbeImmndDeadlock);
>       (*pbeImmndDeadlock) = SA_FALSE;
> @@ -1999,7 +2009,10 @@ uint32_t immnd_proc_server(uint32_t *tim
>                       /*Phase 2 */
>                       if (cb->syncPid <= 0) {
>                               /*Fork sync-agent */
> -                             cb->syncPid = immnd_forkSync(cb);
> +                             /* When SC are absent, we don't fork to trigger 
> abortSync */
> +                             if (cb->mIntroduced != 2) {
> +                                     cb->syncPid = immnd_forkSync(cb);
> +                             }
>                               if (cb->syncPid <= 0) {
For the ER logging also ad the if condition
if(cb->mIntroduced == 2 )
>                                       LOG_ER("Failed to fork sync process");
>                                       cb->syncPid = 0;
> @@ -2063,6 +2076,19 @@ uint32_t immnd_proc_server(uint32_t *tim
>   
>               if(cb->mIntroduced == 2) {
>                       immnd_introduceMe(cb);
> +                     if(cb->pbePid > 0) {
> +                             /* Check if pbe process is terminated.
> +                              * Will send SIGKILL if it's not terminated. */
> +                             int status = 0;
> +                             if (waitpid(cb->pbePid, &status, WNOHANG) > 0) {
> +                                     cb->pbePid = 0;
> +                                     LOG_NO("PBE has terminated due to SC 
> absence");
> +                             } else if (++cb->mPbeKills > 10) { /* More than 
> 10 senconds */
> +                                     cb->pbePid = 0;
> +                                     LOG_WA("SC were absent and PBE appears 
> hung, sending SIGKILL");
> +                                     kill(cb->pbePid, SIGKILL);
> +                             }
> +                     }
>                       break;
>               }

The above  else if is not required. When the IMMD is down PBE will 
automatically exit and present in defunct.

                 if (waitpid(cb->pbePid, &status, WNOHANG) > 0) {
                     cb->pbePid = 0;
                     LOG_NO("PBE has terminated due to SC absence");
                     kill(cb->pbePid, SIGKILL);
                 }
>   
> @@ -2333,3 +2359,28 @@ void immnd_proc_discard_other_nodes(IMMN
>       cb->mPbeVeteran = SA_FALSE;
>       TRACE_LEAVE();
>   }
> +
> +void immnd_proc_remove_coordinator_role(IMMND_CB *cb)
> +{
> +     TRACE_ENTER();
> +
> +     cb->mIsCoord = false;
> +
> +     if (cb->mSyncRequested) {
> +             /* Just got sync requested from IMMD, nothing happened yet */
> +             cb->mSyncRequested = false;
> +
> +     } else if (cb->mState == IMM_SERVER_SYNC_SERVER && cb->mPendSync) {
> +             /* Sent out sync-start msg but sync didn't start yet, revert 
> the state to IMM_SERVER_READY */
> +             cb->mPendSync = false;
> +             cb->mState = IMM_SERVER_READY;
> +             LOG_NO("SERVER STATE: IMM_SERVER_SYNC_SERVER --> 
> IMM_SERVER_READY");

The above else if is not required, as the above changes will be taken 
care at immnd_proc_server
> +
> +     } else if (cb->mState == IMM_SERVER_SYNC_SERVER && (cb->syncPid > 0)) {
> +             /* Sync started, kill sync process to trigger sync abort in 
> immnd_proc_server() */
> +             osafassert(!cb->mPendSync);
> +             kill(cb->syncPid, SIGTERM);
> +     }
> +
> +     TRACE_LEAVE();
> +}
when the sync is in progress, additional check is required, when the 
non-cordinator IMMND will be R_AVAILABLE.
This has to be changed back to W_AVAILABLE. Add an if condition;
  else if(cb->mState == IMM_SERVER_READY && immModel_immNotWritable(cb)) {
                                 /* This SC absence allowed case, when 
IMMD is down and
                                    The sync is in progress. Veteran 
nodes Other than the syncing node,
                                    has to change the node state from 
NODE_R_AVAILABLE to NODE_FULLY_AVAILABLE*/
                                 immnd_abortSync(cb);
                         }

The whole function can be added as inline, may not be as separate function.

------------------------------------------------------------------------------
Site24x7 APM Insight: Get Deep Visibility into Application Performance
APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
Monitor end-to-end web transactions and take corrective actions now
Troubleshoot faster and improve end-user experience. Signup Now!
http://pubads.g.doubleclick.net/gampad/clk?id=272487151&iu=/4140
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Re: [devel] [PATCH 1 of 1] imm: Remove coordinator role when SC absence happens [#1625]

Reply via email to