Hi Zoran, Please find my answers inline.
BR, Hung Nguyen - DEK Technologies -------------------------------------------------------------------------------- From: Zoran Milinkovic [email protected] Sent: Monday, March 21, 2016 10:34PM To: Hung Nguyen, Neelakanta Reddy [email protected], [email protected] Cc: Opensaf-devel [email protected] Subject: RE: [PATCH 1 of 1] imm: Remove coordinator role when SC absence happens [#1692] Nice work Hung, Ack from me with one question inline. -----Original Message----- From: Hung Nguyen [mailto:[email protected]] Sent: Tuesday, March 08, 2016 6:23 AM To: Zoran Milinkovic; [email protected] Cc: [email protected] Subject: [PATCH 1 of 1] imm: Remove coordinator role when SC absence happens [#1692] osaf/services/saf/immsv/immnd/immnd_evt.c | 37 ++++++++++++++++++++--------- osaf/services/saf/immsv/immnd/immnd_proc.c | 34 +++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 15 deletions(-) Set 'mIsCoord' to false when headless to avoid coordinator from restarting. Also handle the cases when headless occurs before/during sync. diff --git a/osaf/services/saf/immsv/immnd/immnd_evt.c b/osaf/services/saf/immsv/immnd/immnd_evt.c --- a/osaf/services/saf/immsv/immnd/immnd_evt.c +++ b/osaf/services/saf/immsv/immnd/immnd_evt.c @@ -10194,24 +10194,38 @@ static uint32_t immnd_evt_proc_mds_evt(I } exit(1); } else { /* SC ABSENCE ALLOWED */ + cb->mIntroduced = 2; LOG_WA("SC Absence IS allowed:%u IMMD service is DOWN", cb->mScAbsenceAllowed); if(cb->mIsCoord) { - /* Note that normally the coord will reside at SCs so this branch will - only be relevant if REPEATED toal scAbsence occurs. After SC absence - and subsequent return of SC, the coord will be elected at a payload. - That coord will be active untill restart of that payload.. - unless we add functionality for the payload coord to restart after - a few minutes .. ? 
- */ - LOG_WA("This IMMND coord has to exit allowing restarted IMMD to select new coord"); - if(cb->mState < IMM_SERVER_SYNC_SERVER) { - immnd_ackToNid(NCSCC_RC_FAILURE); + cb->mIsCoord = false; + + if (cb->mSyncRequested) { + /* Just got sync requested from IMMD, nothing happened yet */ + cb->mSyncRequested = false; + + } else if (cb->mState == IMM_SERVER_SYNC_SERVER && cb->mPendSync) { + /* Sent out sync-start msg but sync didn't start yet, revert the state to IMM_SERVER_READY */ + cb->mPendSync = false; + cb->mState = IMM_SERVER_READY; + LOG_NO("SERVER STATE: IMM_SERVER_SYNC_SERVER --> IMM_SERVER_READY"); + + } else if (cb->mState == IMM_SERVER_SYNC_SERVER && (cb->syncPid > 0)) { + /* Sync started, kill sync process to trigger sync abort in immnd_proc_server() */ + osafassert(!cb->mPendSync); + kill(cb->syncPid, SIGTERM); } [Zoran] Shouldn't other cb->mState states be handled here? For example IMM_SERVER_LOADING_SERVER. [Hung] Those states are not valid for the coordinator. When this node is the coordinator, its state should be either SERVER_READY or SERVER_SYNC_SERVER. I think the last 'else' should be the same as replaced 'IF' statement (cb->mState < IMM_SERVER_SYNC_SERVER). [Hung] mPendSync is set before the coordinator sends sync-start msg to IMMD. mPendSync is unset after the coordinator receives sync-start response msg from IMMD. The imm-sync process is only forked after that (mPendSync is unset). So mPendSync can't be true when syncPid is greater than zero. Thanks, Zoran - exit(1); + } else if(cb->mState <= IMM_SERVER_LOADING_PENDING) { /* Reset state in payloads that had not joined. No need to restart. */ LOG_IN("Resetting IMMND state from %u to IMM_SERVER_ANONYMOUS", cb->mState); cb->mState = IMM_SERVER_ANONYMOUS; + + } else if (cb->mState == IMM_SERVER_READY && immModel_immNotWritable(cb)) { + /* This SC absence allowed case, when IMMD is down and + The sync is in progress. 
Veteran nodes Other than the syncing node, + has to change the node state from NODE_R_AVAILABLE to NODE_FULLY_AVAILABLE*/ + immnd_abortSync(cb); + } else if(cb->mState < IMM_SERVER_READY) { LOG_WA("IMMND was being synced or loaded (%u), has to restart", cb->mState); if(cb->mState < IMM_SERVER_SYNC_SERVER) { @@ -10220,7 +10234,6 @@ static uint32_t immnd_evt_proc_mds_evt(I exit(1); } } - cb->mIntroduced = 2; LOG_NO("IMMD SERVICE IS DOWN, HYDRA IS CONFIGURED => UNREGISTERING IMMND form MDS"); immnd_mds_unregister(cb); /* Discard local clients ... */ diff --git a/osaf/services/saf/immsv/immnd/immnd_proc.c b/osaf/services/saf/immsv/immnd/immnd_proc.c --- a/osaf/services/saf/immsv/immnd/immnd_proc.c +++ b/osaf/services/saf/immsv/immnd/immnd_proc.c @@ -872,7 +872,7 @@ void immnd_abortSync(IMMND_CB *cb) memset(&send_evt, '\0', sizeof(IMMSV_EVT)); TRACE_ENTER(); TRACE("ME:%u RE:%u", cb->mMyEpoch, cb->mRulingEpoch); - osafassert(cb->mIsCoord); + osafassert(cb->mIsCoord || (cb->mScAbsenceAllowed && cb->mIntroduced == 2 )); cb->mPendSync = 0; if(cb->mSyncFinalizing) { cb->mSyncFinalizing = 0x0; @@ -898,6 +898,12 @@ void immnd_abortSync(IMMND_CB *cb) LOG_ER("immnd_abortSync not clean on epoch: RE:%u ME:%u", cb->mRulingEpoch, cb->mMyEpoch); } + /* Skip broadcasting sync abort msg when SC are absent */ + if (cb->mScAbsenceAllowed && cb->mIntroduced == 2) { + TRACE_LEAVE(); + return; + } + while (!immnd_is_immd_up(cb) && (retryCount++ < 20)) { LOG_WA("Coord blocked in sending ABORT_SYNC because IMMD is DOWN %u", retryCount); sleep(1); @@ -1319,6 +1325,10 @@ void immnd_proc_global_abort_ccb(IMMND_C static SaBoolT immnd_ccbsTerminated(IMMND_CB *cb, SaUint32T duration, SaBoolT* pbeImmndDeadlock) { + if (cb->mIntroduced == 2) { + /* Return true to enter phase 2 or phase 3 of SYNC_SERVER */ + return SA_TRUE; + } osafassert(cb->mIsCoord); osafassert(pbeImmndDeadlock); (*pbeImmndDeadlock) = SA_FALSE; @@ -1999,9 +2009,14 @@ uint32_t immnd_proc_server(uint32_t *tim /*Phase 2 */ if 
(cb->syncPid <= 0) { /*Fork sync-agent */ - cb->syncPid = immnd_forkSync(cb); + /* When SC are absent, we don't fork to trigger abortSync */ + if (cb->mIntroduced != 2) { + cb->syncPid = immnd_forkSync(cb); + } if (cb->syncPid <= 0) { - LOG_ER("Failed to fork sync process"); + if (cb->mIntroduced != 2) { + LOG_ER("Failed to fork sync process"); + } cb->syncPid = 0; cb->mStep = 0; cb->mJobStart = now; @@ -2063,6 +2078,19 @@ uint32_t immnd_proc_server(uint32_t *tim if(cb->mIntroduced == 2) { immnd_introduceMe(cb); + if(cb->pbePid > 0) { + /* Check if pbe process is terminated. + * Will send SIGKILL if it's not terminated. */ + int status = 0; + if (waitpid(cb->pbePid, &status, WNOHANG) > 0) { + cb->pbePid = 0; + LOG_NO("PBE has terminated due to SC absence"); + } else { + cb->pbePid = 0; + LOG_WA("SC were absent and PBE appears hung, sending SIGKILL"); + kill(cb->pbePid, SIGKILL); + } + } break; } ------------------------------------------------------------------------------ Transform Data into Opportunity. Accelerate data analysis in your applications with Intel Data Analytics Acceleration Library. Click to learn more. http://pubads.g.doubleclick.net/gampad/clk?id=278785351&iu=/4140 _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
