Hi Hung, I tested the patch and it works, but every time the cluster comes back from headless state, osafimmloadd and sometimes osafimmpbed zombie processes are left behind.
This is after PL-3 came back from headless state twice. $ ps aux | grep osaf root 2873 0.0 0.0 29896 1852 ? S<sl 14:10 0:00 /usr/lib/opensaf/osaftransportd root 2876 2.9 4.0 387144 83740 ? SNsl 14:10 0:14 /usr/lib/opensaf/osafimmnd root 2885 0.1 0.1 159496 3424 ? Ssl 14:11 0:00 /usr/lib/opensaf/osafclmna root 2893 0.1 0.2 384012 4532 ? Ssl 14:11 0:00 /usr/lib/opensaf/osafamfnd root 2903 0.0 0.1 157396 2752 ? Ssl 14:11 0:00 /usr/lib/opensaf/osafamfwd root 2912 0.0 0.2 249220 5048 ? Ssl 14:11 0:00 /usr/lib/opensaf/osafckptnd root 2921 0.0 0.1 160168 2936 ? Ssl 14:11 0:00 /usr/lib/opensaf/osaflcknd root 2943 76.0 1.0 199828 20720 ? Rsl 14:11 5:54 /usr/lib/opensaf/osafmsgnd root 2953 0.0 0.1 159484 2796 ? Ssl 14:11 0:00 /usr/lib/opensaf/osafsmfnd root 2991 0.4 0.0 0 0 ? ZN 14:14 0:01 [osafimmpbed] <defunct> root 3002 0.3 0.0 0 0 ? ZN 14:14 0:00 [osafimmloadd] <defunct> root 3018 1.2 0.0 0 0 ? ZN 14:17 0:01 [osafimmpbed] <defunct> root 3029 0.0 0.0 0 0 ? ZN 14:17 0:00 [osafimmloadd] <defunct> root 3034 2.9 0.2 169620 6060 ? SNl 14:18 0:01 /usr/lib/opensaf/osafimmpbed --pbe /etc/opensaf/imm.db Thanks, Zoran -----Original Message----- From: Hung Nguyen [mailto:hung.d.ngu...@dektech.com.au] Sent: den 10 februari 2017 08:26 To: Zoran Milinkovic <zoran.milinko...@ericsson.com>; reddy.neelaka...@oracle.com Cc: opensaf-devel@lists.sourceforge.net Subject: [PATCH 1 of 1] imm: Fix problems with removing coordinator role when cluster goes headless [#2296] src/imm/immnd/immnd_evt.c | 33 ++++++++++++++++++++++++++++++--- src/imm/immnd/immnd_proc.c | 22 ++-------------------- 2 files changed, 32 insertions(+), 23 deletions(-) When SC comes back too fast, it will fail to change to IMM_SERVER_READY state because immnd_proc_server() is not executed. This patch basically reverts the changes in immnd_proc_server() made by #1692 and moves them to immnd_evt_proc_mds_evt(). Also, this patch handles the case when D2ND_SYNC_START is received but the sync process isn't forked yet. 
That case was not handled in the patch for #1692. diff --git a/src/imm/immnd/immnd_evt.c b/src/imm/immnd/immnd_evt.c --- a/src/imm/immnd/immnd_evt.c +++ b/src/imm/immnd/immnd_evt.c @@ -10386,15 +10386,28 @@ static uint32_t immnd_evt_proc_mds_evt(I cb->mSyncRequested = false; } else if (cb->mState == IMM_SERVER_SYNC_SERVER && cb->mPendSync) { - /* Sent out sync-start msg but sync didn't start yet, revert the state to IMM_SERVER_READY */ + /* Sent out ND2D_SYNC_START msg but sync didn't start yet, revert the state to IMM_SERVER_READY */ cb->mPendSync = false; cb->mState = IMM_SERVER_READY; LOG_NO("SERVER STATE: IMM_SERVER_SYNC_SERVER --> IMM_SERVER_READY"); + } else if (cb->mState == IMM_SERVER_SYNC_SERVER && (cb->syncPid <= 0)) { + /* Received D2ND_SYNC_START msg but sync process wasn't forked yet, revert the state to IMM_SERVER_READY */ + cb->mState = IMM_SERVER_READY; + LOG_NO("SERVER STATE: IMM_SERVER_SYNC_SERVER --> IMM_SERVER_READY"); + immnd_abortSync(cb); + } else if (cb->mState == IMM_SERVER_SYNC_SERVER && (cb->syncPid > 0)) { - /* Sync started, kill sync process to trigger sync abort in immnd_proc_server() */ + /* Sync started, force kill sync process */ osafassert(!cb->mPendSync); - kill(cb->syncPid, SIGTERM); + + LOG_NO("Force kill sync process and abort sync"); + kill(cb->syncPid, SIGKILL); + cb->syncPid = 0; + + cb->mState = IMM_SERVER_READY; + LOG_NO("SERVER STATE: IMM_SERVER_SYNC_SERVER --> IMM_SERVER_READY"); + immnd_abortSync(cb); } } else if(cb->mState <= IMM_SERVER_LOADING_PENDING) { @@ -10430,6 +10443,20 @@ static uint32_t immnd_evt_proc_mds_evt(I LOG_ER("FAILURE IN REGISTERING IMMND WITH MDS - exiting"); exit(1); } + + if(cb->pbePid > 0) { + /* Check if pbe process is terminated. + * Will send SIGKILL if it's not terminated. 
*/ + int status = 0; + if (waitpid(cb->pbePid, &status, WNOHANG) > 0) { + LOG_NO("PBE has terminated due to SC absence"); + } else { + LOG_WA("SC were absent and PBE appears hung, sending SIGKILL"); + kill(cb->pbePid, SIGKILL); + } + cb->pbePid = 0; + } + } else if ((evt->info.mds_info.change == NCSMDS_UP) && (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) { LOG_NO("IMMD service is UP ... ScAbsenseAllowed?:%u introduced?:%u", cb->mScAbsenceAllowed, cb->mIntroduced); diff --git a/src/imm/immnd/immnd_proc.c b/src/imm/immnd/immnd_proc.c --- a/src/imm/immnd/immnd_proc.c +++ b/src/imm/immnd/immnd_proc.c @@ -2014,14 +2014,9 @@ uint32_t immnd_proc_server(uint32_t *tim /*Phase 2 */ if (cb->syncPid <= 0) { /*Fork sync-agent */ - /* When SC are absent, we don't fork to trigger abortSync */ - if (cb->mIntroduced != 2) { - cb->syncPid = immnd_forkSync(cb); - } + cb->syncPid = immnd_forkSync(cb); if (cb->syncPid <= 0) { - if (cb->mIntroduced != 2) { - LOG_ER("Failed to fork sync process"); - } + LOG_ER("Failed to fork sync process"); cb->syncPid = 0; cb->mStep = 0; cb->mJobStart = now; @@ -2083,19 +2078,6 @@ uint32_t immnd_proc_server(uint32_t *tim if(cb->mIntroduced == 2) { immnd_introduceMe(cb); - if(cb->pbePid > 0) { - /* Check if pbe process is terminated. - * Will send SIGKILL if it's not terminated. */ - int status = 0; - if (waitpid(cb->pbePid, &status, WNOHANG) > 0) { - cb->pbePid = 0; - LOG_NO("PBE has terminated due to SC absence"); - } else { - LOG_WA("SC were absent and PBE appears hung, sending SIGKILL"); - kill(cb->pbePid, SIGKILL); - cb->pbePid = 0; - } - } break; } ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel