Well, I think we can read the pid from /var/run/opensaf/*.pid for any daemon
that is spawned by 'start_daemon', because the pid value inside the
/var/run/opensaf/osaf*.pid file is written by the 'start_daemon' LSB command
and not by NID.

Hmm, NID forks our opensaf clc-cli scripts using fork_script().
fork_script() returns the pid of the child, which would be the pid of the
clc-cli script.
The opensaf clc-cli scripts in turn call start_daemon, which stores the pid in
the /var/run/opensaf/osaf*.pid file.

Are you referring to our clc-cli scripts becoming zombies!?

- Mathi.

----- [email protected] wrote:

> Hi Mathi,
> 
> Isn't the problem that the programs nodeinit spawns are in its turn 
> daemonized, i.e. now owned by the init process.
> But the pid from nodeinit's initial fork, service->pid, may now be a 
> zombie process?
> So  the /var/run/pid files should be used in both script, daemon and
> the 
> process case.
> /Thanks HansN
> 
> On 04/14/2015 03:02 PM, Mathivanan Naickan Palanivelu wrote:
> > Hi Hans,
> >
> > I have clarified to your comment below. It's an ACK anyways.
> >
> >> Nodeinit sends SIGKILL to parent pid returned from fork. Sending
> >> SIGABRT
> >> the child pid should be used instead.
> > Yes that's true, and a good catch. Must have been caught in the
> first version!
> >
> > You would have already guessed the reason also. But in case if you
> have not,
> > then this behaviour is seen because we marked in the nodeinit.conf,
> for NID to spawn scripts
> > by specifying "S"
> > i.e. For eg:-
> >
> /usr/local/lib/opensaf/clc-cli/osaf-rded:RDE:S:/usr/local/lib/opensaf/clc-cli/osaf-rded:12000:-6:2:1:start:stop
> >
> > Therefore the pid is that of the script and not of our executable,
> also while spawning scripts, NID also cancels all the signals for that
> process.
> >
> > If we had specified it as "D" for daemon or "E" for regular process
> then the service->pid would have
> > had the pid of that process itself.
> >
> > Thanks,
> > Mathi.
> >
> > ----- [email protected] wrote:
> >
> >> osaf/services/infrastructure/nid/nodeinit.c |  88
> >> +++++++++++++++++++++++++++++
> >>   1 files changed, 88 insertions(+), 0 deletions(-)
> >>
> >>
> >> Nodeinit sends SIGKILL to parent pid returned from fork. Sending
> >> SIGABRT
> >> the child pid should be used instead.
> >>
> >> diff --git a/osaf/services/infrastructure/nid/nodeinit.c
> >> b/osaf/services/infrastructure/nid/nodeinit.c
> >> --- a/osaf/services/infrastructure/nid/nodeinit.c
> >> +++ b/osaf/services/infrastructure/nid/nodeinit.c
> >> @@ -56,6 +56,10 @@
> >>   #include <sys/time.h>
> >>   #include <sys/resource.h>
> >>   
> >> +#include <signal.h>
> >> +#include <sys/wait.h>
> >> +#include <stdint.h>
> >> +
> >>   #include <configmake.h>
> >>   #include <rda_papi.h>
> >>   #include <logtrace.h>
> >> @@ -1084,6 +1088,58 @@ uint32_t check_process(NID_SPAWN_INFO *s
> >>    TRACE_LEAVE();
> >>   }
> >>   
> >> +
> >>
> +/****************************************************************************
> >> + * Name          : get_pid_from_file
> >>        *
> >> + *
> >>        *
> >> + * Description   : Retrieves the given service name pid.
> >>        *
> >> + *
> >>        *
> >> + * Arguments     : service name.
> >>        *
> >> + *
> >>        *
> >> + * Return Values : > 0 - process id of given service
> >>        *
> >> + *                 -1 - error, see syslog
> >>        *
> >> + *
> >>        *
> >> +
> >>
> ***************************************************************************/
> >> +static pid_t get_pid_from_file(const char* service_name)
> >> +{
> >> +  char pid_file[NAME_MAX];
> >> +
> >> +  char prog_name[40];
> >> +  char *service, *tmp;
> >> +  FILE *f;
> >> +  pid_t pid;
> >> +
> >> +  service = (char*) malloc(strlen(service_name) +1);
> >> +  strcpy(service, service_name);
> >> +  tmp = service;
> >> +  for ( ; *tmp; ++tmp) *tmp = tolower(*tmp);
> >> +
> >> +  strcpy(prog_name, "osaf");
> >> +  strcat(prog_name, service);
> >> +  free(service);
> >> +
> >> +  LOG_IN("XXXX %s", prog_name);
> >> +
> >> +  snprintf(pid_file, sizeof(pid_file), PKGPIDDIR "/%s.pid",
> >> prog_name);
> >> +
> >> +  if ((f = fopen(pid_file, "r")) == 0) {
> >> +          LOG_WA("Failed to open %s", pid_file);
> >> +          return -1;
> >> +  }
> >> +
> >> +  if (fscanf(f, "%d", &pid) == 0) {
> >> +          LOG_WA("Could not read PID from file %s", pid_file);
> >> +          return -1;
> >> +  }
> >> +
> >> +  if (fclose(f) != 0) {
> >> +          LOG_WA("Could not close file");
> >> +          return -1;
> >> +  }
> >> +
> >> +  return pid;
> >> +}
> >> +
> >>
> >>
> /****************************************************************************
> >>    * Name          : cleanup
> >>        *
> >>    *
> >>        *
> >> @@ -1108,6 +1164,38 @@ void cleanup(NID_SPAWN_INFO *service)
> >>    nid_close_ipc();
> >>    select_fd = -1;
> >>   
> >> +  pid_t w_pid;
> >> +  pid_t pid;
> >> +  int status;
> >> +  uint32_t no_of_retries = 0;
> >> +  const uint32_t MAX_NO_RETRIES = 5;
> >> +
> >> +  // get pid of current service_name instead of the parent pid
> >> +  pid = get_pid_from_file(service->serv_name);
> >> +  if (pid > 0) {
> >> +          if (check_process(service)) {
> >> +                  // send abort signal to process to generate a core dump
> >> +                  LOG_ER("Sending SIGABRT to %s, pid=%d, (parent pid=%d)",
> >> service->serv_name, pid, service->pid);
> >> +                  if (kill(pid, SIGABRT) >= 0) {
> >> +                          // wait a short period for process to exit
> >> +                          do {
> >> +                                  w_pid = waitpid(service->pid, &status, 
> >> WNOHANG);
> >> +                                  if (w_pid < 0) {
> >> +                                          if (errno == EINTR)
> >> +                                                  continue;
> >> +                                          else
> >> +                                                  break;
> >> +                                  } else if (w_pid > 0) {
> >> +                                          if (WIFEXITED(status) || 
> >> WIFSIGNALED(status)) {
> >> +                                                  break;
> >> +                                          }
> >> +                                  }
> >> +                                  sleep(1);
> >> +                          } while (++no_of_retries < MAX_NO_RETRIES);
> >> +                  }
> >> +          }
> >> +  }
> >> +  // if sending abort signal did not succeed, fallback to sigkill
> >>    if (check_process(service)) {
> >>            LOG_ER("Sending SIGKILL to %s, pid=%d", service->serv_name,
> >> service->pid);
> >>            kill(service->pid, SIGKILL);

------------------------------------------------------------------------------
BPM Camp - Free Virtual Workshop May 6th at 10am PDT/1PM EDT
Develop your own process in accordance with the BPMN 2 standard
Learn Process modeling best practices with Bonita BPM through live exercises
http://www.bonitasoft.com/be-part-of-it/events/bpm-camp-virtual-event?utm_source=Sourceforge_BPM_Camp_5_6_15&utm_medium=email&utm_campaign=VA_SF
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to