Hi Canh,

You have removed the code that makes ntf wait for ntfimcn to terminate before 
returning from CSI set callback when doing a switch over. This will probably 
work but I don't think it is a good idea.
The only change needed is to remove kill_imcnproc(SIGTERM) and 
kill_imcnproc(SIGABRT) from the timedwait_imcn_exit() function and add a
LOG_NO("%s: NTF sent SIGKILL to ntfimcn process pid [%d]); just before calling 
kill_imcnproc(SIGKILL).

Removing waiting for ntfimcn process to terminate in handle_state_ntfimcn() 
means that when doing switch over ntfimcn on the new active may start before 
(the old) ntfimcn on the to become standby node is terminated. If this happen 
the special applier is not released when the new ntfimcn wants to be applier. 
For now two special applier names exist and if ntfimcn cannot use one it will 
try to use the other one so this may still work. If the wait mechanism is kept 
it is possible to remove using two special appliers in ntfimcn so the other one 
can be used for something else or. There is also a risk that someone in the 
future removes the applier name handling without thinking about the termination 
timing problem and gets a hard to find intermittent problem.

My conclusion:
Keep the wait for termination logic and go for the simpler solution described 
above

Thanks
Lennart

> -----Original Message-----
> From: Canh Van Truong [mailto:[email protected]]
> Sent: den 9 maj 2018 10:30
> To: Lennart Lund <[email protected]>; Vu Minh Nguyen
> <[email protected]>; Minh Hon Chau
> <[email protected]>
> Cc: [email protected]; Canh Van Truong
> <[email protected]>
> Subject: [PATCH 1/1] ntf: remove sending SIGTERM and SIGABRT when killing
> osafntfimcnd [#2851]
> 
> The handler of SIGTERM is just _Exit,  so using SIGTERM/SIGABRT is not
> needed.
> The patch changes to send SIGKILL when terminating osafntfimcnd process.
> ---
>  src/ntf/ntfd/ntfs_imcnutil.c    | 136 
> +++-------------------------------------
>  src/ntf/ntfimcnd/ntfimcn_main.c |  18 ------
>  2 files changed, 8 insertions(+), 146 deletions(-)
> 
> diff --git a/src/ntf/ntfd/ntfs_imcnutil.c b/src/ntf/ntfd/ntfs_imcnutil.c
> index 00c2c0039..8e9db29ea 100644
> --- a/src/ntf/ntfd/ntfs_imcnutil.c
> +++ b/src/ntf/ntfd/ntfs_imcnutil.c
> @@ -55,27 +55,19 @@ pthread_mutex_t ntfimcn_mutex;
>   * Kill the osafntfimcn child process using previously saved Pid
>   * Error handled
>   *
> - * @param signal A valid signal
> - * @return -1 on error (kill failed
>   */
> -static void kill_imcnproc(int signal)
> +static void kill_imcnproc()
>  {
> -     int rc = 0;
> -
> -     rc = kill(ipar.pid, signal);
> +     LOG_NO("%s: Terminating osafntfimcnd (%d) process",
> +            __FUNCTION__, ipar.pid);
> +     int rc = kill(ipar.pid, SIGKILL);
>       if (rc == -1) {
>               if (errno == EPERM) {
> -                     /* We don't have permission to kill the process! */
> +                     // We don't have permission to kill the process!
>                       LOG_ER(
>                           "Child process osafntfimcn could not be killed: %s",
>                           strerror(errno));
> -             } else if (errno == ESRCH) {
> -                     /* The process was not found */
> -                     LOG_NO(
> -                         "Child process osafntfimcn could not be killed: %s",
> -                         strerror(errno));
> -             } else {
> -                     /* EINVAL Invalid or unsupported signal number */
> +             } else if (errno != ESRCH) {
>                       LOG_ER("%s Fatal error when killing %s",
> __FUNCTION__,
>                              strerror(errno));
>                       osaf_abort(0);
> @@ -83,115 +75,6 @@ static void kill_imcnproc(int signal)
>       }
>  }
> 
> -/**
> - * Timed out wait for the imcnproc to terminate.
> - *
> - * @return -1 If wait timed out
> - */
> -static int wait_imcnproc_termination(void)
> -{
> -     struct timespec wait_poll;
> -     pid_t rc_pid = (pid_t)0;
> -     int retry_cnt = 0;
> -     const int max_retry = 100;
> -     const uint64_t wait_poll_ms = 100;
> -     int status = 0;
> -     int rc = 0;
> -
> -     TRACE_ENTER();
> -     osaf_millis_to_timespec(wait_poll_ms, &wait_poll);
> -
> -     /* Wait for child process to terminate */
> -     retry_cnt = 0;
> -     do {
> -             rc_pid = waitpid(ipar.pid, &status, WNOHANG);
> -             if (rc_pid == -1) {
> -                     if (errno == EINTR) {
> -                             TRACE("\tGot EINTR continue waiting");
> -                             rc_pid = (pid_t)0;
> -                     } else if (errno == ECHILD) {
> -                             TRACE("\t Done, the process is terminated");
> -                             rc = 0;
> -                             break;
> -                     } else {
> -                             /* EINVAL Invalid option */
> -                             LOG_ER("%s Internal error %s",
> __FUNCTION__,
> -                                    strerror(errno));
> -                             osaf_abort(0);
> -                     }
> -             }
> -
> -             /* No status available. Wait and try again */
> -             retry_cnt++;
> -             if (retry_cnt <= max_retry) {
> -                     osaf_nanosleep(&wait_poll);
> -             } else {
> -                     TRACE("\tTermination timeout");
> -                     rc = -1;
> -                     break;
> -             }
> -     } while (rc_pid == 0);
> -
> -     TRACE_LEAVE2("rc = %d, retry_cnt = %d", rc, retry_cnt);
> -     return rc;
> -}
> -
> -/**
> - * Wait for imcn process to exit.
> - *  1.Send SIGTERM to the imcn process
> - *  - Wait for process to terminate
> - *    ~ Prevents "zombie" process
> - *    ~ Guaranty that process is terminated before return
> - *    If the process does not terminate after a timeout the
> - *    termination escalates with the following steps:
> - *  2. Abort signal is sent to imcn process
> - *     Wait for termination. If fail do next step.
> - *  3. Send SIGKILL
> - *     Wait for termination. If no termination is detected, ignore and
> - *     continue NTF. Something is wrong and we may end up with a 'zombie'
> - *     process but there is nothing more to do except aborting NTF causing a
> - *     node restart.
> - *
> - * Also handle the case that the process already is killed before
> - * this function is called.
> - *
> - */
> -static void timedwait_imcn_exit(void)
> -{
> -     int rc = 0;
> -     TRACE_ENTER();
> -
> -     /* Send SIGTERM for normal termination */
> -     kill_imcnproc(SIGTERM);
> -     rc = wait_imcnproc_termination();
> -     if (rc == 0) {
> -             TRACE("\tImcn successfully terminated");
> -             goto done;
> -     }
> -
> -     /* Normal termination has timed out. Escalate to step 2 */
> -     TRACE("\tNormal termination failed. Escalate to abort");
> -     kill_imcnproc(SIGABRT);
> -     rc = wait_imcnproc_termination();
> -     if (rc == 0) {
> -             TRACE("\tImcn successfully aborted");
> -             goto done;
> -     }
> -
> -     /* Abort has also timed out. Escalate to step 3 */
> -     TRACE("\tNormal termination failed. Escalate to kill");
> -     kill_imcnproc(SIGKILL);
> -     rc = wait_imcnproc_termination();
> -     if (rc == 0) {
> -             TRACE("\tImcn successfully killed");
> -     } else {
> -             LOG_WA("The osafntfimcnd process could not be killed!");
> -     }
> -
> -done:
> -     TRACE_LEAVE();
> -}
> -
>  static pid_t create_imcnprocess(SaAmfHAStateT ha_state)
>  {
>       char *start_args[3];
> @@ -329,9 +212,7 @@ void handle_state_ntfimcn(SaAmfHAStateT ha_state)
>               osaf_mutex_lock_ordie(&ntfimcn_mutex);
>               if (ha_state != ipar.ha_state) {
>                       ipar.ha_state = ha_state;
> -                     TRACE("%s: Terminating osafntfimcnd process",
> -                           __FUNCTION__);
> -                     timedwait_imcn_exit();
> +                     kill_imcnproc();
>               } else {
>                       TRACE("%s: osafntfimcnd process not restarted. "
>                             "Already in correct state",
> @@ -361,8 +242,7 @@ int stop_ntfimcn(void)
>       /* Kill ntfimcn */
>       osaf_mutex_lock_ordie(&ntfimcn_mutex);
>       ipar.ntfimcn_on = false;
> -     TRACE("%s: Terminating osafntfimcnd process", __FUNCTION__);
> -     timedwait_imcn_exit();
> +     kill_imcnproc();
>       osaf_mutex_unlock_ordie(&ntfimcn_mutex);
> 
>       /* Cancel the surveillance thread */
> diff --git a/src/ntf/ntfimcnd/ntfimcn_main.c
> b/src/ntf/ntfimcnd/ntfimcn_main.c
> index 51be3dce0..40da79c32 100644
> --- a/src/ntf/ntfimcnd/ntfimcn_main.c
> +++ b/src/ntf/ntfimcnd/ntfimcn_main.c
> @@ -78,18 +78,6 @@ static void sigusr2_handler(int sig)
>       }
>  }
> 
> -/**
> - * TERM signal handler
> - *
> - * @param sig[in]
> - */
> -static void sigterm_handler(int sig)
> -{
> -     (void)sig;
> -     signal(SIGTERM, SIG_IGN);
> -     _Exit(EXIT_SUCCESS);
> -}
> -
>  /*
>   * Exit if anything fails. This will cause ntfs to restart ntfimcn
>   */
> @@ -99,12 +87,6 @@ int main(int argc, char **argv)
>       const char *trace_label = "osafntfimcnd";
>       SaAisErrorT ais_error = SA_AIS_OK;
> 
> -     // To make sure the SIGTERM is not lost during init phase
> -     if (signal(SIGTERM, sigterm_handler) == SIG_ERR) {
> -             LOG_ER("signal TERM failed: %s", strerror(errno));
> -             _Exit(EXIT_FAILURE);
> -     }
> -
>       /*
>        * Activate Log Trace
>        */
> --
> 2.15.1


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to