Hi Hung,

I tested the patch and it works, but every time the cluster comes back 
from headless state, osafimmloadd and sometimes osafimmpbed zombie 
processes are left behind.

This is on PL-3 after it came back from headless state twice.
$ ps aux | grep osaf
root      2873  0.0  0.0  29896  1852 ?        S<sl 14:10   0:00 
/usr/lib/opensaf/osaftransportd
root      2876  2.9  4.0 387144 83740 ?        SNsl 14:10   0:14 
/usr/lib/opensaf/osafimmnd
root      2885  0.1  0.1 159496  3424 ?        Ssl  14:11   0:00 
/usr/lib/opensaf/osafclmna
root      2893  0.1  0.2 384012  4532 ?        Ssl  14:11   0:00 
/usr/lib/opensaf/osafamfnd
root      2903  0.0  0.1 157396  2752 ?        Ssl  14:11   0:00 
/usr/lib/opensaf/osafamfwd
root      2912  0.0  0.2 249220  5048 ?        Ssl  14:11   0:00 
/usr/lib/opensaf/osafckptnd
root      2921  0.0  0.1 160168  2936 ?        Ssl  14:11   0:00 
/usr/lib/opensaf/osaflcknd
root      2943 76.0  1.0 199828 20720 ?        Rsl  14:11   5:54 
/usr/lib/opensaf/osafmsgnd
root      2953  0.0  0.1 159484  2796 ?        Ssl  14:11   0:00 
/usr/lib/opensaf/osafsmfnd
root      2991  0.4  0.0      0     0 ?        ZN   14:14   0:01 [osafimmpbed] 
<defunct>
root      3002  0.3  0.0      0     0 ?        ZN   14:14   0:00 [osafimmloadd] 
<defunct>
root      3018  1.2  0.0      0     0 ?        ZN   14:17   0:01 [osafimmpbed] 
<defunct>
root      3029  0.0  0.0      0     0 ?        ZN   14:17   0:00 [osafimmloadd] 
<defunct>
root      3034  2.9  0.2 169620  6060 ?        SNl  14:18   0:01 
/usr/lib/opensaf/osafimmpbed --pbe /etc/opensaf/imm.db

Thanks,
Zoran

-----Original Message-----
From: Hung Nguyen [mailto:hung.d.ngu...@dektech.com.au] 
Sent: den 10 februari 2017 08:26
To: Zoran Milinkovic <zoran.milinko...@ericsson.com>; 
reddy.neelaka...@oracle.com
Cc: opensaf-devel@lists.sourceforge.net
Subject: [PATCH 1 of 1] imm: Fix problems with removing coordinator role when 
cluster goes headless [#2296]

 src/imm/immnd/immnd_evt.c  |  33 ++++++++++++++++++++++++++++++---
 src/imm/immnd/immnd_proc.c |  22 ++--------------------
 2 files changed, 32 insertions(+), 23 deletions(-)


When SC comes back too fast, it will fail to change to the IMM_SERVER_READY 
state because immnd_proc_server() is not executed.
This patch basically reverts the changes in immnd_proc_server() made by #1692
and moves them to immnd_evt_proc_mds_evt().

Also, this patch handles the case when D2ND_SYNC_START is received but the 
sync process isn't forked yet.
That case was not handled in the patch for #1692.

diff --git a/src/imm/immnd/immnd_evt.c b/src/imm/immnd/immnd_evt.c
--- a/src/imm/immnd/immnd_evt.c
+++ b/src/imm/immnd/immnd_evt.c
@@ -10386,15 +10386,28 @@ static uint32_t immnd_evt_proc_mds_evt(I
                                        cb->mSyncRequested = false;
 
                                } else if (cb->mState == IMM_SERVER_SYNC_SERVER 
&& cb->mPendSync) {
-                                       /* Sent out sync-start msg but sync 
didn't start yet, revert the state to IMM_SERVER_READY */
+                                       /* Sent out ND2D_SYNC_START msg but 
sync didn't start yet, revert the state to IMM_SERVER_READY */
                                        cb->mPendSync = false;
                                        cb->mState = IMM_SERVER_READY;
                                        LOG_NO("SERVER STATE: 
IMM_SERVER_SYNC_SERVER --> IMM_SERVER_READY");
 
+                               } else if (cb->mState == IMM_SERVER_SYNC_SERVER 
&& (cb->syncPid <= 0)) {
+                                       /* Received D2ND_SYNC_START msg but 
sync process wasn't forked yet, revert the state to IMM_SERVER_READY */
+                                       cb->mState = IMM_SERVER_READY;
+                                       LOG_NO("SERVER STATE: 
IMM_SERVER_SYNC_SERVER --> IMM_SERVER_READY");
+                                       immnd_abortSync(cb);
+
                                } else if (cb->mState == IMM_SERVER_SYNC_SERVER 
&& (cb->syncPid > 0)) {
-                                       /* Sync started, kill sync process to 
trigger sync abort in immnd_proc_server() */
+                                       /* Sync started, force kill sync 
process */
                                        osafassert(!cb->mPendSync);
-                                       kill(cb->syncPid, SIGTERM);
+
+                                       LOG_NO("Force kill sync process and 
abort sync");
+                                       kill(cb->syncPid, SIGKILL);
+                                       cb->syncPid = 0;
+
+                                       cb->mState = IMM_SERVER_READY;
+                                       LOG_NO("SERVER STATE: 
IMM_SERVER_SYNC_SERVER --> IMM_SERVER_READY");
+                                       immnd_abortSync(cb);
                                }
 
                        } else if(cb->mState <= IMM_SERVER_LOADING_PENDING) {
@@ -10430,6 +10443,20 @@ static uint32_t immnd_evt_proc_mds_evt(I
                        LOG_ER("FAILURE IN REGISTERING IMMND WITH MDS - 
exiting");
                        exit(1);
                }
+
+               if(cb->pbePid > 0) {
+                       /* Check if pbe process is terminated.
+                        * Will send SIGKILL if it's not terminated. */
+                       int status = 0;
+                       if (waitpid(cb->pbePid, &status, WNOHANG) > 0) {
+                               LOG_NO("PBE has terminated due to SC absence");
+                       } else {
+                               LOG_WA("SC were absent and PBE appears hung, 
sending SIGKILL");
+                               kill(cb->pbePid, SIGKILL);
+                       }
+                       cb->pbePid = 0;
+               }
+
        } else if ((evt->info.mds_info.change == NCSMDS_UP) && 
(evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
                LOG_NO("IMMD service is UP ... ScAbsenseAllowed?:%u 
introduced?:%u",
                           cb->mScAbsenceAllowed, cb->mIntroduced);
diff --git a/src/imm/immnd/immnd_proc.c b/src/imm/immnd/immnd_proc.c
--- a/src/imm/immnd/immnd_proc.c
+++ b/src/imm/immnd/immnd_proc.c
@@ -2014,14 +2014,9 @@ uint32_t immnd_proc_server(uint32_t *tim
                        /*Phase 2 */
                        if (cb->syncPid <= 0) {
                                /*Fork sync-agent */
-                               /* When SC are absent, we don't fork to trigger 
abortSync */
-                               if (cb->mIntroduced != 2) {
-                                       cb->syncPid = immnd_forkSync(cb);
-                               }
+                               cb->syncPid = immnd_forkSync(cb);
                                if (cb->syncPid <= 0) {
-                                       if (cb->mIntroduced != 2) {
-                                               LOG_ER("Failed to fork sync 
process");
-                                       }
+                                       LOG_ER("Failed to fork sync process");
                                        cb->syncPid = 0;
                                        cb->mStep = 0;
                                        cb->mJobStart = now;
@@ -2083,19 +2078,6 @@ uint32_t immnd_proc_server(uint32_t *tim
 
                if(cb->mIntroduced == 2) {
                        immnd_introduceMe(cb);
-                       if(cb->pbePid > 0) {
-                               /* Check if pbe process is terminated.
-                                * Will send SIGKILL if it's not terminated. */
-                               int status = 0;
-                               if (waitpid(cb->pbePid, &status, WNOHANG) > 0) {
-                                       cb->pbePid = 0;
-                                       LOG_NO("PBE has terminated due to SC 
absence");
-                               } else {
-                                       LOG_WA("SC were absent and PBE appears 
hung, sending SIGKILL");
-                                       kill(cb->pbePid, SIGKILL);
-                                       cb->pbePid = 0;
-                               }
-                       }
                        break;
                }
 

------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to