This could be a very powerful feature. Although transmitting each node's
process list could be a little heavy handed as many of the processes that
run on a given node are just noise for a person that is monitoring the
progress of a particular cluster-wide job... so maybe have the 10
processes with the highest usage? But this isn't _always_ going to yield
the process one might be interested in..
All this said, I think that ganglia's strength is its efficiency... if
every node's process list gets mcast across the network in addition to
the existing metric traffic; ganglia _might_ choke the network a bit more
than some would like.
Mike
Asaph Zemach ([EMAIL PROTECTED]) said:
> How about extending ganglia to collect ps information?
> Suppose we add to the XML something like:
>
> <!ELEMENT PROCESS EMPTY>
> <!ATTLIST PROCESS NAME CDATA #REQUIRED
> USER CDATA #REQUIRED
> PID CDATA #REQUIRED
> CPU CDATA #REQUIRED
> MEM CDATA #REQUIRED
> SZ CDATA #REQUIRED
> RSS CDATA #REQUIRED
> STATUS CDATA #REQUIRED
> ..... whatever else looks useful
> >
>
> And the per-node output would look like:
>
>
> <HOST NAME="compute-0-2" IP="10.255.255.252" REPORTED="1013270664">
> <METRIC NAME="mem_free" VAL="475380" TYPE="uint32" UNITS="KBs"
> SOURCE="gmond"/>
>
> [....]
>
> <METRIC NAME="os_release" VAL="2.4.9-13smp" TYPE="string" UNITS=""
> SOURCE="gmond"/>
>
> <PROCESS NAME="mozilla-bin" USER="asaph" PID="13845" CPU="12.4" MEM="22.3"
> SZ="62008" RSS="55352" STATUS="S" />
>
> [...]
>
> <PROCESS NAME="/bin/csh" USER="asaph" PID="13840" CPU="0.0" MEM="0.3"
> SZ="3872" RSS="2259" STATUS="S" />
> </HOST>
>
>
> We could then easily implement a cluster-wide ps utility.
>
> On the negative side, this style of implementation would
> tend to return stale information, you wouldn't want to broadcast
> this information more than once every few seconds, so anybody
> using the feature would always be seeing the state of the processes
> as they were a few seconds ago.
>
> On the plus side this gives us a bound on the bandwidth consumed
> by the cluster-wide ps function. We know that no matter how
> many people retrieve the cluster-wide ps information we will
> not consume more than N*process_list_size/sample_rate of bandwidth.
>
> Moreover, since applications running on clusters tend to be
> long lived perhaps using somewhat stale information is no
> big deal.
>
> Thoughts?
>
> Asaph
>
>
>
> On Tue, Apr 09, 2002 at 12:53:29PM -0700, matt massie wrote:
> > asaph-
> >
> > this is a much better way of collecting the metrics on linux. i like that
> > your method eliminates 3 threads and all the mutex locking. i'll
> > try out the code and likely include it in the next release.
> >
> > -matt
> >
> > Today, Asaph Zemach wrote forth saying...
> >
> > > > Here is a drop-in replacement for linux.c that does not
> > > use the extra threads and gets rid of the now-unneeded
> > > locking. It seems to work. I think it's a little cleaner
> > > and more maintainable (e.g. no forgotten locking) for the future.
> > >
> > > Decide if you want to keep it.
> > >
> > > Asaph
> > >
> > >
> > > ------------------------------------------------------------------
> > > #include <time.h>
> > > #include "ganglia.h"
> > > #include "metric_typedefs.h"
> > >
> > > /*
> > > #include "set_metric_val.h"
> > > */
> > >
> > > #define OSNAME "Linux"
> > > #define OSNAME_LEN strlen(OSNAME)
> > >
> > > /* Never changes: filled by metric_init() from /proc/cpuinfo and
> > >  * /proc/sys/kernel/osrelease; the other functions here only read them. */
> > > char proc_cpuinfo[BUFFSIZE];
> > > char proc_sys_kernel_osrelease[BUFFSIZE];
> > >
> > > /*
> > >  * A file whose contents are kept in memory and re-read by update_file()
> > >  * only after more than `thresh` seconds have passed since the last read.
> > >  */
> > > typedef struct {
> > >     int last_read;          /* time() value of the last successful read */
> > >     int thresh;             /* age in seconds that triggers a re-read */
> > >     char *name;             /* path of the file */
> > >     char buffer[BUFFSIZE];  /* contents from the most recent read */
> > > } timely_file;
> > >
> > > /* last_read starts at 0, so the first update_file() call reads the file. */
> > > timely_file proc_stat    = { .last_read = 0, .thresh = 15, .name = "/proc/stat" };
> > > timely_file proc_loadavg = { .last_read = 0, .thresh = 15, .name = "/proc/loadavg" };
> > > timely_file proc_meminfo = { .last_read = 0, .thresh = 30, .name = "/proc/meminfo" };
> > >
> > > /*
> > >  * Return tf->buffer, first re-reading tf->name into it when more than
> > >  * tf->thresh seconds have passed since the last successful read.
> > >  * If slurpfile() fails, the error is logged and last_read is left alone,
> > >  * so the next call tries again.
> > >  */
> > > char *update_file(timely_file *tf)
> > > {
> > >     int current;
> > >     int status;
> > >
> > >     current = time(0);
> > >
> > >     /* Still fresh: hand back the cached contents. */
> > >     if (current - tf->last_read <= tf->thresh)
> > >         return tf->buffer;
> > >
> > >     status = slurpfile(tf->name, tf->buffer, BUFFSIZE);
> > >     if (status != SYNAPSE_FAILURE)
> > >         tf->last_read = current;
> > >     else
> > >         err_msg("update_file() got an error from slurpfile() reading %s",
> > >                 tf->name);
> > >
> > >     return tf->buffer;
> > > }
> > >
> > >
> > >
> > >
> > >
> > >
> > > /*
> > >  * This function is called only once by the gmond. Use to
> > >  * initialize data structures, etc or just return SYNAPSE_SUCCESS;
> > >  *
> > >  * Here it reads /proc/cpuinfo and /proc/sys/kernel/osrelease into the
> > >  * file-level buffers. The result is returned in .int32: SYNAPSE_SUCCESS,
> > >  * or the failing slurpfile() return value.
> > >  */
> > > g_val_t
> > > metric_init(void)
> > > {
> > >     g_val_t rval;
> > >
> > >     rval.int32 = slurpfile("/proc/cpuinfo", proc_cpuinfo, BUFFSIZE);
> > >     if ( rval.int32 == SYNAPSE_FAILURE )
> > >     {
> > >         err_msg("metric_init() got an error from slurpfile() "
> > >                 "/proc/cpuinfo");
> > >         return rval;
> > >     }
> > >
> > >     rval.int32 = slurpfile( "/proc/sys/kernel/osrelease",
> > >                             proc_sys_kernel_osrelease, BUFFSIZE);
> > >     if ( rval.int32 == SYNAPSE_FAILURE )
> > >     {
> > >         err_msg("metric_init() got an error from slurpfile() "
> > >                 "/proc/sys/kernel/osrelease");
> > >         return rval;
> > >     }
> > >
> > >     /*
> > >      * Get rid of pesky \n in osrelease.
> > >      * NOTE(review): this treats the slurpfile() return value as the number
> > >      * of bytes read -- confirm. The checks keep an empty read from
> > >      * indexing [-1] and leave the string alone if it has no trailing \n.
> > >      */
> > >     if ( rval.int32 > 0 && proc_sys_kernel_osrelease[rval.int32-1] == '\n' )
> > >         proc_sys_kernel_osrelease[rval.int32-1] = '\0';
> > >
> > >     rval.int32 = SYNAPSE_SUCCESS;
> > >     return rval;
> > > }
> > >
> > > /*
> > >  * Number of CPUs, returned in .uint16.
> > >  *
> > >  * Counts the whitespace-separated tokens in /proc/stat that begin with
> > >  * "cpu" and subtracts one (presumably for the aggregate "cpu" line --
> > >  * confirm against the /proc/stat layout). The count is computed on the
> > >  * first successful call and reused afterwards. Returns 0 if /proc/stat
> > >  * cannot be opened or contains no such token.
> > >  */
> > > g_val_t
> > > cpu_num_func ( void )
> > > {
> > >     FILE *f;
> > >     static int cpu_num = 0;
> > >     char line[80];
> > >     g_val_t val;
> > >
> > >     /* Only need to do this once */
> > >     if (! cpu_num)
> > >     {
> > >         f = fopen("/proc/stat", "r");
> > >         if (f == NULL)
> > >         {
> > >             err_msg("cpu_num_func() could not open /proc/stat");
> > >             val.uint16 = 0;
> > >             return val;
> > >         }
> > >         /* %79s keeps fscanf() from writing past the end of line[80] */
> > >         while (fscanf(f, "%79s", line) == 1)
> > >             if (strncmp(line, "cpu", 3) == 0)
> > >                 cpu_num++;
> > >         fclose(f);
> > >     }
> > >     /* Avoid wrapping to 65535 when nothing was counted. */
> > >     val.uint16 = (cpu_num > 0) ? cpu_num - 1 : 0;
> > >     return val;
> > > }
> > >
> > > /*
> > >  * CPU speed taken from the "cpu MHz" line of the cached /proc/cpuinfo,
> > >  * returned in .uint32. strtol() in base 10 stops at the first non-digit,
> > >  * so only the integer part is kept. The value is parsed on demand and
> > >  * reused once it is nonzero. Returns 0 when there is no "cpu MHz" line
> > >  * or no ':' after it.
> > >  */
> > > g_val_t
> > > cpu_speed_func ( void )
> > > {
> > >     char *p;
> > >     static g_val_t val = {0};
> > >
> > >     if (! val.uint32 )
> > >     {
> > >         /* Either search may fail; never dereference a NULL result. */
> > >         p = strstr( proc_cpuinfo, "cpu MHz" );
> > >         if ( p != NULL )
> > >             p = strchr( p, ':' );
> > >         if ( p != NULL )
> > >         {
> > >             p++;
> > >             p = skip_whitespace(p);
> > >             val.uint32 = (uint32_t)strtol( p, (char **)NULL , 10 );
> > >         }
> > >     }
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Number following the "MemTotal:" label in the cached /proc/meminfo,
> > >  * returned in .uint32. Returns 0 when the label is not present.
> > >  */
> > > g_val_t
> > > mem_total_func ( void )
> > > {
> > >     char *p;
> > >     g_val_t val;
> > >
> > >     p = strstr( update_file(&proc_meminfo), "MemTotal:");
> > >     if ( p == NULL )
> > >     {
> > >         /* Label missing: do not hand NULL to skip_token()/strtol(). */
> > >         val.uint32 = 0;
> > >         return val;
> > >     }
> > >     p = skip_token(p);
> > >     val.uint32 = strtol( p, (char **)NULL, 10 );
> > >
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Number following the "SwapTotal:" label in the cached /proc/meminfo,
> > >  * returned in .uint32. Returns 0 when the label is not present.
> > >  */
> > > g_val_t
> > > swap_total_func ( void )
> > > {
> > >     char *p;
> > >     g_val_t val;
> > >
> > >     p = strstr( update_file(&proc_meminfo), "SwapTotal:" );
> > >     if ( p == NULL )
> > >     {
> > >         /* Label missing: do not hand NULL to skip_token()/strtol(). */
> > >         val.uint32 = 0;
> > >         return val;
> > >     }
> > >     p = skip_token(p);
> > >     val.uint32 = strtol( p, (char **)NULL, 10 );
> > >
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Number following the "btime" keyword in the cached /proc/stat,
> > >  * returned in .uint32. Returns 0 when the keyword is not present.
> > >  */
> > > g_val_t
> > > boottime_func ( void )
> > > {
> > >     char *p;
> > >     g_val_t val;
> > >
> > >     p = update_file(&proc_stat);
> > >
> > >     p = strstr ( p, "btime" );
> > >     if ( p == NULL )
> > >     {
> > >         /* Keyword missing: do not hand NULL to skip_token()/strtod(). */
> > >         val.uint32 = 0;
> > >         return val;
> > >     }
> > >     p = skip_token ( p );
> > >     val.uint32 = strtod ( p, (char **)NULL );
> > >
> > >     return val;
> > > }
> > >
> > > /* Current time as reported by time(), stored in .uint32. */
> > > g_val_t
> > > sys_clock_func ( void )
> > > {
> > >     g_val_t now;
> > >
> > >     now.uint32 = time(NULL);
> > >
> > >     return now;
> > > }
> > >
> > > /*
> > >  * Architecture name chosen at compile time, returned in .str:
> > >  * "ia64", "x86" or "alpha" according to the IA64 / __i386__ / __alpha__
> > >  * macros, or "unknown" when none of them is defined (previously .str
> > >  * was returned uninitialized in that case).
> > >  */
> > > g_val_t
> > > machine_type_func ( void )
> > > {
> > >     g_val_t val;
> > >
> > >     /* Fallback so .str always holds a terminated string. */
> > >     snprintf(val.str, MAX_G_STRING_SIZE, "unknown");
> > > #ifdef IA64
> > >     snprintf(val.str, MAX_G_STRING_SIZE, "ia64");
> > > #endif
> > > #ifdef __i386__
> > >     snprintf(val.str, MAX_G_STRING_SIZE, "x86");
> > > #endif
> > > #ifdef __alpha__
> > >     snprintf(val.str, MAX_G_STRING_SIZE, "alpha");
> > > #endif
> > >     return val;
> > > }
> > >
> > > /* Operating system name: always the string "Linux", returned in .str. */
> > > g_val_t
> > > os_name_func ( void )
> > > {
> > >     static const char os_name[] = "Linux";
> > >     g_val_t result;
> > >
> > >     snprintf(result.str, MAX_G_STRING_SIZE, "%s", os_name);
> > >
> > >     return result;
> > > }
> > >
> > > /*
> > >  * Kernel release string: a copy of the proc_sys_kernel_osrelease buffer,
> > >  * limited to MAX_G_STRING_SIZE bytes, returned in .str.
> > >  */
> > > g_val_t
> > > os_release_func ( void )
> > > {
> > >     g_val_t result;
> > >     const char *release = proc_sys_kernel_osrelease;
> > >
> > >     snprintf(result.str, MAX_G_STRING_SIZE, "%s", release);
> > >
> > >     return result;
> > > }
> > >
> > > /*
> > >  * A helper function to return the total number of cpu jiffies:
> > >  * the sum of the first four numbers (named user, nice, system and idle
> > >  * in this file) after the leading token of the cached /proc/stat.
> > >  */
> > > unsigned long
> > > total_jiffies_func ( void )
> > > {
> > >     unsigned long field;
> > >     unsigned long sum = 0;
> > >     char *cursor;
> > >     int i;
> > >
> > >     cursor = update_file(&proc_stat);
> > >     cursor = skip_token(cursor);
> > >
> > >     /* user, nice, system, idle -- in that order */
> > >     for (i = 0; i < 4; i++)
> > >     {
> > >         cursor = skip_whitespace(cursor);
> > >         field = strtod( cursor, &cursor );
> > >         sum += field;
> > >     }
> > >
> > >     return sum;
> > > }
> > >
> > > /*
> > >  * Change in the first /proc/stat counter (user jiffies) since the previous
> > >  * call, as a percentage of the change in total jiffies over the same
> > >  * span; returned in .f. Yields 0.0 when the counter has not moved.
> > >  */
> > > g_val_t
> > > cpu_user_func ( void )
> > > {
> > >     static double prev_user, prev_total;
> > >     double user, total, delta;
> > >     char *cursor;
> > >     g_val_t val;
> > >
> > >     cursor = update_file(&proc_stat);
> > >     cursor = skip_token(cursor);            /* step over the leading label */
> > >
> > >     user = strtod( cursor, (char **)NULL );
> > >     total = total_jiffies_func();
> > >
> > >     delta = user - prev_user;
> > >     val.f = delta ? (delta/(total - prev_total))*100 : 0.0;
> > >
> > >     /* remember this sample for the next call */
> > >     prev_user = user;
> > >     prev_total = total;
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Change in the second /proc/stat counter (nice jiffies) since the
> > >  * previous call, as a percentage of the change in total jiffies over the
> > >  * same span; returned in .f. Yields 0.0 when the counter has not moved.
> > >  */
> > > g_val_t
> > > cpu_nice_func ( void )
> > > {
> > >     static double prev_nice, prev_total;
> > >     double nice, total, delta;
> > >     char *cursor;
> > >     g_val_t val;
> > >     int i;
> > >
> > >     cursor = update_file(&proc_stat);
> > >
> > >     /* step over the leading label and the user field */
> > >     for (i = 0; i < 2; i++)
> > >         cursor = skip_token(cursor);
> > >
> > >     nice = strtod( cursor, (char **)NULL );
> > >     total = total_jiffies_func();
> > >
> > >     delta = nice - prev_nice;
> > >     val.f = delta ? (delta/(total - prev_total))*100 : 0.0;
> > >
> > >     /* remember this sample for the next call */
> > >     prev_nice = nice;
> > >     prev_total = total;
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Change in the third /proc/stat counter (system jiffies) since the
> > >  * previous call, as a percentage of the change in total jiffies over the
> > >  * same span; returned in .f. Yields 0.0 when the counter has not moved.
> > >  */
> > > g_val_t
> > > cpu_system_func ( void )
> > > {
> > >     static double prev_system, prev_total;
> > >     double system, total, delta;
> > >     char *cursor;
> > >     g_val_t val;
> > >     int i;
> > >
> > >     cursor = update_file(&proc_stat);
> > >
> > >     /* step over the leading label, user and nice fields */
> > >     for (i = 0; i < 3; i++)
> > >         cursor = skip_token(cursor);
> > >
> > >     system = strtod( cursor, (char **)NULL );
> > >     total = total_jiffies_func();
> > >
> > >     delta = system - prev_system;
> > >     val.f = delta ? (delta/(total - prev_total))*100 : 0.0;
> > >
> > >     /* remember this sample for the next call */
> > >     prev_system = system;
> > >     prev_total = total;
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Change in the fourth /proc/stat counter (idle jiffies) since the
> > >  * previous call, as a percentage of the change in total jiffies over the
> > >  * same span; returned in .f. Yields 0.0 when the counter has not moved.
> > >  */
> > > g_val_t
> > > cpu_idle_func ( void )
> > > {
> > >     static double prev_idle, prev_total;
> > >     double idle, total, delta;
> > >     char *cursor;
> > >     g_val_t val;
> > >     int i;
> > >
> > >     cursor = update_file(&proc_stat);
> > >
> > >     /* step over the leading label, user, nice and system fields */
> > >     for (i = 0; i < 4; i++)
> > >         cursor = skip_token(cursor);
> > >
> > >     idle = strtod( cursor, (char **)NULL );
> > >     total = total_jiffies_func();
> > >
> > >     delta = idle - prev_idle;
> > >     val.f = delta ? (delta/(total - prev_total))*100 : 0.0;
> > >
> > >     /* remember this sample for the next call */
> > >     prev_idle = idle;
> > >     prev_total = total;
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Fourth /proc/stat counter (idle jiffies) as a percentage of the current
> > >  * total jiffies, using the absolute counter values rather than the change
> > >  * since an earlier call; returned in .f. The parameter i is not used.
> > >  */
> > > g_val_t
> > > cpu_aidle_func ( uint32_t i )
> > > {
> > >     double idle, total;
> > >     char *cursor;
> > >     g_val_t val;
> > >     int n;
> > >
> > >     cursor = update_file(&proc_stat);
> > >
> > >     /* step over the leading label, user, nice and system fields */
> > >     for (n = 0; n < 4; n++)
> > >         cursor = skip_token(cursor);
> > >
> > >     idle = strtod( cursor, (char **)NULL );
> > >     total = total_jiffies_func();
> > >
> > >     val.f = (idle/total)*100;
> > >     return val;
> > > }
> > >
> > > /* First number in the cached /proc/loadavg, returned in .f. */
> > > g_val_t
> > > load_one_func ( void )
> > > {
> > >     char *text;
> > >     g_val_t val;
> > >
> > >     text = update_file(&proc_loadavg);
> > >     val.f = strtod( text, (char **)NULL);
> > >
> > >     return val;
> > > }
> > >
> > > /* Second number in the cached /proc/loadavg, returned in .f. */
> > > g_val_t
> > > load_five_func ( void )
> > > {
> > >     char *cursor;
> > >     g_val_t val;
> > >
> > >     cursor = update_file(&proc_loadavg);
> > >     cursor = skip_token(cursor);        /* step over the first field */
> > >
> > >     val.f = strtod( cursor, (char **)NULL);
> > >     return val;
> > > }
> > >
> > > /* Third number in the cached /proc/loadavg, returned in .f. */
> > > g_val_t
> > > load_fifteen_func ( void )
> > > {
> > >     char *cursor;
> > >     g_val_t val;
> > >     int i;
> > >
> > >     cursor = update_file(&proc_loadavg);
> > >
> > >     /* step over the first two fields */
> > >     for (i = 0; i < 2; i++)
> > >         cursor = skip_token(cursor);
> > >
> > >     val.f = strtod( cursor, (char **)NULL);
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Leading number of the fourth field of the cached /proc/loadavg, minus
> > >  * one, returned in .uint32 and never below 0. (Presumably the field is
> > >  * "running/total" and the decrement discounts the process taking the
> > >  * sample -- confirm.)
> > >  */
> > > g_val_t
> > > proc_run_func( void )
> > > {
> > >     char *p;
> > >     long running;
> > >     g_val_t val;
> > >
> > >     p = update_file(&proc_loadavg);
> > >     p = skip_token(p);
> > >     p = skip_token(p);
> > >     p = skip_token(p);
> > >     running = strtol( p, (char **)NULL, 10 );
> > >
> > >     running--;
> > >     /*
> > >      * This shouldn't happen.. but it might. The clamp is done on a signed
> > >      * value: a "< 0" test on the uint32 field itself can never be true,
> > >      * so a parsed 0 would wrap to 4294967295.
> > >      */
> > >     if (running < 0)
> > >         running = 0;
> > >
> > >     val.uint32 = running;
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Number that follows the slash in the fourth field of the cached
> > >  * /proc/loadavg, returned in .uint32. Returns 0 when no number follows.
> > >  */
> > > g_val_t
> > > proc_total_func ( void )
> > > {
> > >     char *p;
> > >     g_val_t val;
> > >
> > >     p = update_file(&proc_loadavg);
> > >     p = skip_token(p);
> > >     p = skip_token(p);
> > >     p = skip_token(p);
> > >     p = skip_whitespace(p);
> > >     /* isdigit() needs an unsigned char value; plain char may be negative */
> > >     while ( isdigit((unsigned char)*p) )
> > >         p++;
> > >     /* skip the slash-/, but never step past the terminating NUL */
> > >     if ( *p == '/' )
> > >         p++;
> > >     val.uint32 = strtol( p, (char **)NULL, 10 );
> > >
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Number following the "MemFree:" label in the cached /proc/meminfo,
> > >  * returned in .uint32. Returns 0 when the label is not present.
> > >  */
> > > g_val_t
> > > mem_free_func ( void )
> > > {
> > >     char *p;
> > >     g_val_t val;
> > >
> > >     p = strstr( update_file(&proc_meminfo), "MemFree:" );
> > >     if ( p == NULL )
> > >     {
> > >         /* Label missing: do not hand NULL to skip_token()/strtol(). */
> > >         val.uint32 = 0;
> > >         return val;
> > >     }
> > >     p = skip_token(p);
> > >     val.uint32 = strtol( p, (char **)NULL, 10 );
> > >
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Number following the "MemShared:" label in the cached /proc/meminfo,
> > >  * returned in .uint32. Returns 0 when the label is not present.
> > >  */
> > > g_val_t
> > > mem_shared_func ( void )
> > > {
> > >     char *p;
> > >     g_val_t val;
> > >
> > >     p = strstr( update_file(&proc_meminfo), "MemShared:" );
> > >     if ( p == NULL )
> > >     {
> > >         /* Label missing: do not hand NULL to skip_token()/strtol(). */
> > >         val.uint32 = 0;
> > >         return val;
> > >     }
> > >     p = skip_token(p);
> > >     val.uint32 = strtol( p, (char **)NULL, 10 );
> > >
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Number following the "Buffers:" label in the cached /proc/meminfo,
> > >  * returned in .uint32. Returns 0 when the label is not present.
> > >  */
> > > g_val_t
> > > mem_buffers_func ( void )
> > > {
> > >     char *p;
> > >     g_val_t val;
> > >
> > >     p = strstr( update_file(&proc_meminfo), "Buffers:" );
> > >     if ( p == NULL )
> > >     {
> > >         /* Label missing: do not hand NULL to skip_token()/strtol(). */
> > >         val.uint32 = 0;
> > >         return val;
> > >     }
> > >     p = skip_token(p);
> > >     val.uint32 = strtol( p, (char **)NULL, 10 );
> > >
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Number following the first occurrence of "Cached:" in the cached
> > >  * /proc/meminfo, returned in .uint32. Returns 0 when it is not present.
> > >  * NOTE(review): strstr() would also match inside a longer label that
> > >  * ends in "Cached:" if one appeared earlier in the file -- confirm the
> > >  * field order on the kernels of interest.
> > >  */
> > > g_val_t
> > > mem_cached_func ( void )
> > > {
> > >     char *p;
> > >     g_val_t val;
> > >
> > >     p = strstr( update_file(&proc_meminfo), "Cached:");
> > >     if ( p == NULL )
> > >     {
> > >         /* Label missing: do not hand NULL to skip_token()/strtol(). */
> > >         val.uint32 = 0;
> > >         return val;
> > >     }
> > >     p = skip_token(p);
> > >     val.uint32 = strtol( p, (char **)NULL, 10 );
> > >
> > >     return val;
> > > }
> > >
> > > /*
> > >  * Number following the "SwapFree:" label in the cached /proc/meminfo,
> > >  * returned in .uint32. Returns 0 when the label is not present.
> > >  */
> > > g_val_t
> > > swap_free_func ( void )
> > > {
> > >     char *p;
> > >     g_val_t val;
> > >
> > >     p = strstr( update_file(&proc_meminfo), "SwapFree:" );
> > >     if ( p == NULL )
> > >     {
> > >         /* Label missing: do not hand NULL to skip_token()/strtol(). */
> > >         val.uint32 = 0;
> > >         return val;
> > >     }
> > >     p = skip_token(p);
> > >     val.uint32 = strtol( p, (char **)NULL, 10 );
> > >
> > >     return val;
> > > }
> > >
> > >
> >
> >
> > _______________________________________________
> > Ganglia-general mailing list
> > [email protected]
> > https://lists.sourceforge.net/lists/listinfo/ganglia-general
>
> _______________________________________________
> Ganglia-general mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/ganglia-general