With this patch, opensrf will report not only the process id but also
the application name of the child processes -- both when it launches
them and when they terminate.  The library file name is stored as
well, for possible future use.

This change will make it easier for an admin to monitor the system,
especially when an application dies abnormally.

The way I incorporated the names into the messages may or may not be to
your taste, but that's easy to tinker with.

I used a doubly linked list to store the application names, as 
mentioned in my posting of August 18.

As with my earlier patch to this module, I developed the code on a 
stripped-down version of the original that I could test with, and then
back-ported it to the original.  I trust you will take it out for a
spin before committing.

Scott McKellar
http://home.swbell.net/mck9/ct/

Developer's Certificate of Origin 1.1

By making a contribution to this project, I certify that:

(a) The contribution was created in whole or in part by me and I
have the right to submit it under the open source license indicated
in the file; or

(b) The contribution is based upon previous work that, to the best
of my knowledge, is covered under an appropriate open source license
and I have the right under that license to submit that work with
modifications, whether created in whole or in part by me, under the
same open source license (unless I am permitted to submit under a
different license), as indicated in the file; or

(c) The contribution was provided directly to me by some other person
who certified (a), (b) or (c) and I have not modified it; and

(d) In the case of each of (a), (b), or (c), I understand and agree
that this project and the contribution are public and that a record
of the contribution (including all personal information I submit
with it, including my sign-off) is maintained indefinitely and may
be redistributed consistent with this project or the open source
license indicated in the file.
*** trunk/src/libopensrf/osrf_system.c	2007-09-02 08:35:16.000000000 -0500
--- trunk-mod/src/libopensrf/osrf_system.c	2007-09-02 08:41:36.000000000 -0500
***************
*** 5,13 ****
--- 5,31 ----
  
  static int _osrfSystemInitCache( void );
  static void report_child_status( pid_t pid, int status );
+ struct child_node;
+ typedef struct child_node ChildNode;
+ 
+ struct child_node	// one entry per launched child, kept in a doubly linked list
+ {
+ 	ChildNode* pNext;	// following node, or NULL at the tail
+ 	ChildNode* pPrev;	// preceding node, or NULL at the head
+ 	pid_t pid;	// process id of the child
+ 	char* app;	// application name: heap copy owned by the node, may be NULL
+ 	char* libfile;	// library file name: heap copy owned by the node, may be NULL
+ };
+ 
+ static ChildNode* child_list;	// head of the list of children; NULL when empty
  
  static transport_client* osrfGlobalTransportClient = NULL;
  
+ static void add_child( pid_t pid, const char* app, const char* libfile );	// push a new node
+ static void delete_child( ChildNode* node );	// unlink and free one node
+ static void delete_all_children( void );	// empty the whole list
+ static ChildNode* seek_child( pid_t pid );	// find a node by pid, or NULL
+ 
  transport_client* osrfSystemGetTransportClient( void ) {
  	return osrfGlobalTransportClient;
  }
***************
*** 123,130 ****
  				pid_t pid;
  		
  				if( (pid = fork()) ) { 
! 					// storage pid in local table for re-launching dead children...
! 					osrfLogInfo( OSRF_LOG_MARK, "Launched application child %ld", (long) pid);
  	
  				} else {
  		
--- 141,150 ----
  				pid_t pid;
  		
  				if( (pid = fork()) ) { 
! 					// store pid in local list for re-launching dead children...
! 					add_child( pid, appname, libfile );
! 					osrfLogInfo( OSRF_LOG_MARK, "Running application child %s: process id %ld",
! 								 appname, (long) pid );
  	
  				} else {
  		
***************
*** 154,186 ****
  		}
  	}
  
  	return 0;
  }
  
  
! static void report_child_status( pid_t pid, int status ) {
  	
  	if( WIFEXITED( status ) )
  	{
  		int rc = WEXITSTATUS( status );  // return code of child process
  		if( rc )
! 			osrfLogError( OSRF_LOG_MARK, "Child process %ld exited with return code %d",
! 						  (long) pid, rc );
  		else
! 			osrfLogError( OSRF_LOG_MARK, "Child process %ld exited normally", (long) pid );
  	}
  	else if( WIFSIGNALED( status ) )
  	{
! 		osrfLogError( OSRF_LOG_MARK, "Child process %ld killed by signal %d",
! 					  (long) pid, WTERMSIG( status) );
  	}
  	else if( WIFSTOPPED( status ) )
  	{
! 		osrfLogError( OSRF_LOG_MARK, "Child process %ld stopped by signal %d",
! 					  (long) pid, (int) WSTOPSIG( status ) );
  	}
  }
  
  
  int osrf_system_bootstrap_client_resc( char* config_file, char* contextnode, char* resource ) {
  
--- 174,299 ----
  		}
  	}
  
+ 	delete_all_children();
  	return 0;
  }
  
  
! /* Log how child process pid ended, decoding the wait-style status word
!  * with the WIF* macros, and name the application it was running.
!  * The child's list entry is dropped once the process has terminated. */
! static void report_child_status( pid_t pid, int status )
! {
! 	ChildNode* node = seek_child( pid );
! 
! 	/* Never hand NULL to a "%s" conversion (undefined behavior): use a
! 	 * placeholder when the pid is not in the list or has no app name.
! 	 * node->libfile is kept in the list but is not reported here. */
! 	const char* app = ( node && node->app ) ? node->app : "[unknown]";
  	
  	if( WIFEXITED( status ) )
  	{
  		int rc = WEXITSTATUS( status );  // return code of child process
  		if( rc )
! 			osrfLogError( OSRF_LOG_MARK, "Child process %ld (app %s) exited with return code %d",
! 						  (long) pid, app, rc );
  		else
! 			osrfLogError( OSRF_LOG_MARK, "Child process %ld (app %s) exited normally",
! 						  (long) pid, app );
  	}
  	else if( WIFSIGNALED( status ) )
  	{
! 		osrfLogError( OSRF_LOG_MARK, "Child process %ld (app %s) killed by signal %d",
! 					  (long) pid, app, WTERMSIG( status) );
  	}
  	else if( WIFSTOPPED( status ) )
  	{
! 		osrfLogError( OSRF_LOG_MARK, "Child process %ld (app %s) stopped by signal %d",
! 					  (long) pid, app, (int) WSTOPSIG( status ) );
  	}
+ 	if( WIFEXITED( status ) || WIFSIGNALED( status ) )	// a stopped child is still alive
+ 		delete_child( node );	// NULL (unknown pid) is ignored by delete_child()
+ }
+ 
+ /*----------- Routines to manage list of children --*/
+ 
+ /* Remember a newly forked child by pushing a node onto the child list.
+  *
+  *   pid     - process id of the child
+  *   app     - application name; the node keeps its own copy (may be NULL)
+  *   libfile - library file name; the node keeps its own copy (may be NULL)
+  */
+ static void add_child( pid_t pid, const char* app, const char* libfile )
+ {
+ 	/* NOTE(review): the safe_malloc() result is used unchecked; presumably
+ 	   it never returns NULL -- confirm against its definition */
+ 	ChildNode* fresh = safe_malloc( sizeof( ChildNode ) );
+ 
+ 	/* Payload: a NULL name is stored as NULL rather than copied.  The
+ 	   strdup() results are not checked, so a failed copy also yields NULL. */
+ 	fresh->pid     = pid;
+ 	fresh->app     = app     ? strdup( app )     : NULL;
+ 	fresh->libfile = libfile ? strdup( libfile ) : NULL;
+ 
+ 	/* The new node becomes the head, so nothing precedes it */
+ 	fresh->pPrev = NULL;
+ 	fresh->pNext = child_list;
+ 
+ 	/* Back-link the former head, if the list was not empty */
+ 	if( fresh->pNext )
+ 		fresh->pNext->pPrev = fresh;
+ 
+ 	child_list = fresh;
  }
  
+ /* Unlink node from the child list, then free it and the strings it owns.
+    A NULL node is ignored. */
+ static void delete_child( ChildNode* node ) {
+ 	ChildNode* before;
+ 	ChildNode* after;
+ 
+ 	if( node == NULL )
+ 		return;
+ 
+ 	before = node->pPrev;
+ 	after  = node->pNext;
+ 	if( before != NULL )
+ 		before->pNext = after;
+ 	else
+ 		child_list = after;	/* node was the head */
+ 
+ 	if( after != NULL )
+ 		after->pPrev = before;
+ 
+ 	free( node->app );
+ 	free( node->libfile );
+ 	free( node );
+ }
+ 
+ /* Empty the child list, freeing every node it still holds */
+ static void delete_all_children( void ) {
+ 	while( child_list != NULL )
+ 		delete_child( child_list );	/* unlinks the head each time round */
+ }
+ 
+ /* Look up the child node recorded for process id pid.
+    Returns the first node, searching from the head, whose pid matches,
+    or NULL when no node matches (which includes an empty list). */
+ static ChildNode* seek_child( pid_t pid ) {
+ 
+ 	ChildNode* cursor;
+ 
+ 	for( cursor = child_list; cursor != NULL; cursor = cursor->pNext ) {
+ 		if( cursor->pid == pid )
+ 			return cursor;
+ 	}
+ 
+ 	/* Ran off the end without a match */
+ 	return NULL;
+ }
+ 
+ /*----------- End of routines to manage list of children --*/
+ 
  
  int osrf_system_bootstrap_client_resc( char* config_file, char* contextnode, char* resource ) {
  

Reply via email to