Steve,

I echo Matt in saying this is a great patch. Much better to use the time the cluster reports, and why make an extra system call if we dont have to?

I changed the logic in the RRD update functions in gmetad 2.5.2 from version 2.5.0. There are actually fewer of them in 2.5.2 (about half as many), which I hoped would make changes like yours easier to write. I'm glad to see your change could be done so cleanly.

I have fixed an important hash_foreach() bug in gmond, and I think I have found another one in gmetad (working on it right now - has to do with trying to hash gmetric keys that gperf hash does not expect). As Matt suggested, your patch needs to get out. Along with the bug/metric fixes, it makes a good reason to release 2.5.3 soon.

Again, great patch.
Federico


On Monday, March 3, 2003, at 03:49 PM, Steven Wagner wrote:

This patch was made against 2.5.2 - I haven't tested it on the latest CVS version.

What you get:

* gmetad starts passing a timestamp value down to the individual RRD updating functions. * If a timestamp isn't provided, it generates one using your platform's POSIX-compliant time() function.

This patch may make gmetad unstable, which is why I'm posting it here - I'd like to see what other people's experiences are with this patch. I've tested it on my Solaris 9 (yeah, I upgraded...) front-end and it's working beautifully so far.

Last week I had all sorts of gmetad stability problems which turned out to be related to having a number of truncated/corrupted RRD files. gmetad doesn't currently make allowances for this - if an RRD file exists, it assumes it's working properly. If it doesn't exist, it tries to create it. That's it.

I went through and deleted the truncated files by hand, but a more elegant solution would clearly be to make gmetad a little more RRD-aware in the long run. I'm not sure exactly how this contributed to my gmetad crashing every 40 minutes (perhaps there's some sort of memleak associated with an RRD*() error condition?), but the problem popped up when I moved RRD directories and accidentally left some truncated RRDs around, and disappeared after I removed 'em.

So stay alert, trust no one, keep your laser handy, et cetera.

diff -ruN /tmp/ganglia-monitor-core-2.5.2/gmetad/gmetad.c ganglia-monitor-core-modified/gmetad/gmetad.c --- /tmp/ganglia-monitor-core-2.5.2/gmetad/gmetad.c Mon Dec 9 14:43:23 2002 +++ ganglia-monitor-core-modified/gmetad/gmetad.c Mon Mar 3 09:57:26 2003
@@ -15,6 +15,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <pwd.h>
+#include <time.h>

 hash_t *xml;

@@ -245,7 +246,7 @@
                      sprintf(sum, "%Lf", sum_of_sums[i]);

                      /* Save the data to a round robin database */
- if( write_data_to_rrd( NULL, NULL, (char *)metrics[i].name, sum, num, 15) ) + if( write_data_to_rrd( NULL, NULL, (char *)metrics[i].name, sum, num, 15, time(0)) )
                         {
err_msg("Unable to write meta data to RRDbs");
                         }
diff -ruN /tmp/ganglia-monitor-core-2.5.2/gmetad/gmetad.h ganglia-monitor-core-modified/gmetad/gmetad.h --- /tmp/ganglia-monitor-core-2.5.2/gmetad/gmetad.h Mon Dec 9 14:43:23 2002 +++ ganglia-monitor-core-modified/gmetad/gmetad.h Wed Feb 26 16:56:36 2003
@@ -53,6 +53,7 @@
       g_inet_addr **sources;
       long double  sum[MAX_METRIC_HASH_VALUE];
       unsigned int num[MAX_METRIC_HASH_VALUE];
+      long double timestamp;   /* added by swagner */
       int dead;
    }
 data_source_list_t;
diff -ruN /tmp/ganglia-monitor-core-2.5.2/gmetad/process_xml.c ganglia-monitor-core-modified/gmetad/process_xml.c --- /tmp/ganglia-monitor-core-2.5.2/gmetad/process_xml.c Mon Dec 9 14:43:23 2002 +++ ganglia-monitor-core-modified/gmetad/process_xml.c Wed Feb 26 17:00:39 2003
@@ -6,7 +6,7 @@
 #include <ganglia/xmlparse.h>
 #include <gmetad.h>

-extern int write_data_to_rrd( char *source, char *host, char *metric, char *sum, char *num, unsigned int step); +extern int write_data_to_rrd( char *source, char *host, char *metric, char *sum, char *num, unsigned int step, unsigned int time_polled);

 extern struct xml_tag *in_xml_list (char *, unsigned int);
 extern struct ganglia_metric *in_metric_list (char *, unsigned int);
@@ -102,7 +102,7 @@
                   /* Save the data to a round robin database */
debug_msg("Writing Summary data for source %s", xml_data->source); xml_data->rval = write_data_to_rrd(xml_data->source, NULL, (char*) metrics[i].name, - sum, num, xml_data->ds->step); + sum, num, xml_data->ds->step,xml_data->cluster_localtime);
                   if (xml_data->rval)
                      {
                         /* Pass the error on to save_to_rrd */
@@ -236,7 +236,7 @@
            if (xml_data->grid_depth==0)
            {
xml_data->rval = write_data_to_rrd( xml_data->source, xml_data->host, xml_data->metric, - xml_data->metric_val, NULL, xml_data->ds->step); + xml_data->metric_val, NULL, xml_data->ds->step, xml_data->cluster_localtime);
                 if (xml_data->rval)  {
                      return;
                 }
diff -ruN /tmp/ganglia-monitor-core-2.5.2/gmetad/rrd_helpers.c ganglia-monitor-core-modified/gmetad/rrd_helpers.c --- /tmp/ganglia-monitor-core-2.5.2/gmetad/rrd_helpers.c Mon Dec 9 14:43:24 2002 +++ ganglia-monitor-core-modified/gmetad/rrd_helpers.c Mon Mar 3 14:54:34 2003
@@ -9,6 +9,7 @@
 #include <gmetad.h>
 #include <errno.h>
 #include <pthread.h>
+#include <time.h>

 #define PATHSIZE 4096
 extern char * rrd_rootdir;
@@ -26,17 +27,22 @@
 }

 static int
-RRD_update( char *rrd, char *sum, char *num )
+RRD_update( char *rrd, char *sum, char *num, unsigned int process_time )
 {
    char *argv[3];
    int   argc = 3;
    char val[128];

+   /*  if process_time is undefined, we set it to the current time */
+
+   if (!process_time)
+       process_time = time(0);
+
    /* If we are a host RRD, we "sum" over only one host. */
    if (num)
-      sprintf(val, "N:%s:%s", sum, num);
+      sprintf(val, "%d:%s:%s", process_time, sum, num);
    else
-      sprintf(val, "N:%s", sum);
+      sprintf(val, "%d:%s", process_time, sum);

    argv[0] = "dummy";
    argv[1] = rrd;
@@ -108,7 +114,7 @@
 /* A summary RRD has a "num" and a "sum" DS (datasource) whereas the
    host rrds only have "sum" (since num is always 1) */
 static int
-push_data_to_rrd( char *rrd, char *sum, char *num, unsigned int step)
+push_data_to_rrd( char *rrd, char *sum, char *num, unsigned int step, unsigned int process_time)
 {
    int rval;
    int summary;
@@ -125,13 +131,13 @@
          if( rval )
             return rval;
       }
-   return RRD_update( rrd, sum, num );
+   return RRD_update( rrd, sum, num, process_time );
 }


 /* Assumes num argument will be NULL for a host RRD. */
 int
-write_data_to_rrd ( char *source, char *host, char *metric, char *sum, char *num, unsigned int step ) +write_data_to_rrd ( char *source, char *host, char *metric, char *sum, char *num, unsigned int step, unsigned int process_time )
 {
    char rrd[ PATHSIZE ];
    int rval;
@@ -163,7 +169,7 @@
    strncat(rrd, metric, PATHSIZE);
    strncat(rrd, ".rrd", PATHSIZE);

-   return push_data_to_rrd( rrd, sum, num, step );
+   return push_data_to_rrd( rrd, sum, num, step, process_time );

    /* Shouldn't get here */
    return 1;

Federico

Rocks Cluster Group, SDSC, San Diego
GPG Fingerprint: 3C5E 47E7 BDF8 C14E ED92  92BB BA86 B2E6 0390 8845


Reply via email to