This patch was made against 2.5.2 - I haven't tested it on the latest
CVS version.
What you get:
* gmetad starts passing a timestamp value down to the individual RRD
updating functions.
* If a timestamp isn't provided, it generates one using your
platform's POSIX-compliant time() function.
This patch may make gmetad unstable, which is why I'm posting it here
- I'd like to see what other people's experiences are with this patch.
I've tested it on my Solaris 9 (yeah, I upgraded...) front-end and
it's working beautifully so far.
Last week I had all sorts of gmetad stability problems which turned
out to be related to having a number of truncated/corrupted RRD files.
gmetad doesn't currently make allowances for this - if an RRD file
exists, it assumes it's working properly. If it doesn't exist, it
tries to create it. That's it.
I went through and deleted the truncated files by hand, but a more
elegant solution would clearly be to make gmetad a little more
RRD-aware in the long run. I'm not sure exactly how this contributed
to my gmetad crashing every 40 minutes (perhaps there's some sort of
memleak associated with an RRD*() error condition?), but the problem
popped up when I moved RRD directories and accidentally left some
truncated RRDs around, and disappeared after I removed 'em.
So stay alert, trust no one, keep your laser handy, et cetera.
diff -ruN /tmp/ganglia-monitor-core-2.5.2/gmetad/gmetad.c
ganglia-monitor-core-modified/gmetad/gmetad.c
--- /tmp/ganglia-monitor-core-2.5.2/gmetad/gmetad.c Mon Dec 9
14:43:23 2002
+++ ganglia-monitor-core-modified/gmetad/gmetad.c Mon Mar 3 09:57:26
2003
@@ -15,6 +15,7 @@
#include <sys/stat.h>
#include <unistd.h>
#include <pwd.h>
+#include <time.h>
hash_t *xml;
@@ -245,7 +246,7 @@
sprintf(sum, "%Lf", sum_of_sums[i]);
/* Save the data to a round robin database */
- if( write_data_to_rrd( NULL, NULL, (char
*)metrics[i].name, sum, num, 15) )
+ if( write_data_to_rrd( NULL, NULL, (char
*)metrics[i].name, sum, num, 15, time(0)) )
{
err_msg("Unable to write meta data to
RRDbs");
}
diff -ruN /tmp/ganglia-monitor-core-2.5.2/gmetad/gmetad.h
ganglia-monitor-core-modified/gmetad/gmetad.h
--- /tmp/ganglia-monitor-core-2.5.2/gmetad/gmetad.h Mon Dec 9
14:43:23 2002
+++ ganglia-monitor-core-modified/gmetad/gmetad.h Wed Feb 26 16:56:36
2003
@@ -53,6 +53,7 @@
g_inet_addr **sources;
long double sum[MAX_METRIC_HASH_VALUE];
unsigned int num[MAX_METRIC_HASH_VALUE];
+ long double timestamp; /* added by swagner */
int dead;
}
data_source_list_t;
diff -ruN /tmp/ganglia-monitor-core-2.5.2/gmetad/process_xml.c
ganglia-monitor-core-modified/gmetad/process_xml.c
--- /tmp/ganglia-monitor-core-2.5.2/gmetad/process_xml.c Mon Dec 9
14:43:23 2002
+++ ganglia-monitor-core-modified/gmetad/process_xml.c Wed Feb 26
17:00:39 2003
@@ -6,7 +6,7 @@
#include <ganglia/xmlparse.h>
#include <gmetad.h>
-extern int write_data_to_rrd( char *source, char *host, char *metric,
char *sum, char *num, unsigned int step);
+extern int write_data_to_rrd( char *source, char *host, char *metric,
char *sum, char *num, unsigned int step, unsigned int time_polled);
extern struct xml_tag *in_xml_list (char *, unsigned int);
extern struct ganglia_metric *in_metric_list (char *, unsigned int);
@@ -102,7 +102,7 @@
/* Save the data to a round robin database */
debug_msg("Writing Summary data for source %s",
xml_data->source);
xml_data->rval =
write_data_to_rrd(xml_data->source, NULL, (char*) metrics[i].name,
-
sum, num, xml_data->ds->step);
+
sum, num, xml_data->ds->step,xml_data->cluster_localtime);
if (xml_data->rval)
{
/* Pass the error on to save_to_rrd */
@@ -236,7 +236,7 @@
if (xml_data->grid_depth==0)
{
xml_data->rval = write_data_to_rrd( xml_data->source,
xml_data->host, xml_data->metric,
-
xml_data->metric_val, NULL, xml_data->ds->step);
+
xml_data->metric_val, NULL, xml_data->ds->step,
xml_data->cluster_localtime);
if (xml_data->rval) {
return;
}
diff -ruN /tmp/ganglia-monitor-core-2.5.2/gmetad/rrd_helpers.c
ganglia-monitor-core-modified/gmetad/rrd_helpers.c
--- /tmp/ganglia-monitor-core-2.5.2/gmetad/rrd_helpers.c Mon Dec 9
14:43:24 2002
+++ ganglia-monitor-core-modified/gmetad/rrd_helpers.c Mon Mar 3
14:54:34 2003
@@ -9,6 +9,7 @@
#include <gmetad.h>
#include <errno.h>
#include <pthread.h>
+#include <time.h>
#define PATHSIZE 4096
extern char * rrd_rootdir;
@@ -26,17 +27,22 @@
}
static int
-RRD_update( char *rrd, char *sum, char *num )
+RRD_update( char *rrd, char *sum, char *num, unsigned int
process_time )
{
char *argv[3];
int argc = 3;
char val[128];
+ /* if process_time is undefined, we set it to the current time */
+
+ if (!process_time)
+ process_time = time(0);
+
/* If we are a host RRD, we "sum" over only one host. */
if (num)
- sprintf(val, "N:%s:%s", sum, num);
+ sprintf(val, "%d:%s:%s", process_time, sum, num);
else
- sprintf(val, "N:%s", sum);
+ sprintf(val, "%d:%s", process_time, sum);
argv[0] = "dummy";
argv[1] = rrd;
@@ -108,7 +114,7 @@
/* A summary RRD has a "num" and a "sum" DS (datasource) whereas the
host rrds only have "sum" (since num is always 1) */
static int
-push_data_to_rrd( char *rrd, char *sum, char *num, unsigned int step)
+push_data_to_rrd( char *rrd, char *sum, char *num, unsigned int step,
unsigned int process_time)
{
int rval;
int summary;
@@ -125,13 +131,13 @@
if( rval )
return rval;
}
- return RRD_update( rrd, sum, num );
+ return RRD_update( rrd, sum, num, process_time );
}
/* Assumes num argument will be NULL for a host RRD. */
int
-write_data_to_rrd ( char *source, char *host, char *metric, char
*sum, char *num, unsigned int step )
+write_data_to_rrd ( char *source, char *host, char *metric, char
*sum, char *num, unsigned int step, unsigned int process_time )
{
char rrd[ PATHSIZE ];
int rval;
@@ -163,7 +169,7 @@
strncat(rrd, metric, PATHSIZE);
strncat(rrd, ".rrd", PATHSIZE);
- return push_data_to_rrd( rrd, sum, num, step );
+ return push_data_to_rrd( rrd, sum, num, step, process_time );
/* Shouldn't get here */
return 1;