OK, here goes...
Here are the sections of gmond.conf showing lines that are different from the
default gmond.conf (which comes from RPM installations, I think) for all three
clusters.
ESSC (version 2.5.7):
name "ESSC Cluster"
trusted_hosts gorgon pegasus
all_trusted on
BAS (version 2.5.7):
name "ClusterVision Cluster"
POL (version 3.0.3):
cluster {
name = "NEMO cluster @ POL"
owner = "root"
latlong = "unspecified"
url = "http://www.pol.ac.uk"
}
tcp_accept_channel {
port = 8649
acl {
default = "deny"
access {
ip = 127.0.0.1
mask = 32
action = "allow"
}
access {
ip = <ESSC network address>
mask = 24
action = "allow"
}
access {
ip = <POL network address>
mask = 24
action = "allow"
}
}
}
Here is the start of the XML data from POL.
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<!DOCTYPE GANGLIA_XML [
<!ELEMENT GANGLIA_XML (GRID|CLUSTER|HOST)*>
<!ATTLIST GANGLIA_XML VERSION CDATA #REQUIRED>
<!ATTLIST GANGLIA_XML SOURCE CDATA #REQUIRED>
<!ELEMENT GRID (CLUSTER | GRID | HOSTS | METRICS)*>
<!ATTLIST GRID NAME CDATA #REQUIRED>
<!ATTLIST GRID AUTHORITY CDATA #REQUIRED>
<!ATTLIST GRID LOCALTIME CDATA #IMPLIED>
<!ELEMENT CLUSTER (HOST | HOSTS | METRICS)*>
<!ATTLIST CLUSTER NAME CDATA #REQUIRED>
<!ATTLIST CLUSTER OWNER CDATA #IMPLIED>
<!ATTLIST CLUSTER LATLONG CDATA #IMPLIED>
<!ATTLIST CLUSTER URL CDATA #IMPLIED>
<!ATTLIST CLUSTER LOCALTIME CDATA #REQUIRED>
<!ELEMENT HOST (METRIC)*>
<!ATTLIST HOST NAME CDATA #REQUIRED>
<!ATTLIST HOST IP CDATA #REQUIRED>
<!ATTLIST HOST LOCATION CDATA #IMPLIED>
<!ATTLIST HOST REPORTED CDATA #REQUIRED>
<!ATTLIST HOST TN CDATA #IMPLIED>
<!ATTLIST HOST TMAX CDATA #IMPLIED>
<!ATTLIST HOST DMAX CDATA #IMPLIED>
<!ATTLIST HOST GMOND_STARTED CDATA #IMPLIED>
<!ELEMENT METRIC EMPTY>
<!ATTLIST METRIC NAME CDATA #REQUIRED>
<!ATTLIST METRIC VAL CDATA #REQUIRED>
<!ATTLIST METRIC TYPE (string | int8 | uint8 | int16 | uint16 | int32 |
uint32 | float | double | timestamp) #REQUIRED>
<!ATTLIST METRIC UNITS CDATA #IMPLIED>
<!ATTLIST METRIC TN CDATA #IMPLIED>
<!ATTLIST METRIC TMAX CDATA #IMPLIED>
<!ATTLIST METRIC DMAX CDATA #IMPLIED>
<!ATTLIST METRIC SLOPE (zero | positive | negative | both | unspecified)
#IMPLIED>
<!ATTLIST METRIC SOURCE (gmond | gmetric) #REQUIRED>
<!ELEMENT HOSTS EMPTY>
<!ATTLIST HOSTS UP CDATA #REQUIRED>
<!ATTLIST HOSTS DOWN CDATA #REQUIRED>
<!ATTLIST HOSTS SOURCE (gmond | gmetric | gmetad) #REQUIRED>
<!ELEMENT METRICS EMPTY>
<!ATTLIST METRICS NAME CDATA #REQUIRED>
<!ATTLIST METRICS SUM CDATA #REQUIRED>
<!ATTLIST METRICS NUM CDATA #REQUIRED>
<!ATTLIST METRICS TYPE (string | int8 | uint8 | int16 | uint16 | int32 |
uint32 | float | double | timestamp) #REQUIRED>
<!ATTLIST METRICS UNITS CDATA #IMPLIED>
<!ATTLIST METRICS SLOPE (zero | positive | negative | both |
unspecified) #IMPLIED>
<!ATTLIST METRICS SOURCE (gmond | gmetric) #REQUIRED>
]>
<GANGLIA_XML VERSION="3.0.3" SOURCE="gmond">
<CLUSTER NAME="NEMO cluster @ POL" LOCALTIME="1193931648" OWNER="root"
LATLONG="unspecified" URL="http://www.pol.ac.uk">
<HOST NAME="node001.beowulf.cluster" IP="10.141.0.1" REPORTED="1193931638"
TN="9" TMAX="20" DMAX="0" LOCATION="unspecified" GMOND_STARTED="1189409909">
<METRIC NAME="disk_total" VAL="62.340" TYPE="double" UNITS="GB" TN="137"
TMAX="1200" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_speed" VAL="2593" TYPE="uint32" UNITS="MHz" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="part_max_used" VAL="36.3" TYPE="float" UNITS="" TN="125"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="swap_total" VAL="16322000" TYPE="uint32" UNITS="KB" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="os_name" VAL="Linux" TYPE="string" UNITS="" TN="136" TMAX="1200"
DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_user" VAL="0.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_system" VAL="0.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_aidle" VAL="99.5" TYPE="float" UNITS="%" TN="69" TMAX="3800"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_five" VAL="0.01" TYPE="float" UNITS="" TN="21" TMAX="325"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="proc_run" VAL="0" TYPE="uint32" UNITS="" TN="761" TMAX="950"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_free" VAL="4204152" TYPE="uint32" UNITS="KB" TN="9"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_buffers" VAL="90788" TYPE="uint32" UNITS="KB" TN="9"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="swap_free" VAL="16322000" TYPE="uint32" UNITS="KB" TN="9"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="bytes_in" VAL="1521.07" TYPE="float" UNITS="bytes/sec" TN="153"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="pkts_out" VAL="0.50" TYPE="float" UNITS="packets/sec" TN="153"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_num" VAL="4" TYPE="uint16" UNITS="CPUs" TN="136" TMAX="1200"
DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="disk_free" VAL="57.743" TYPE="double" UNITS="GB" TN="125"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_total" VAL="8163180" TYPE="uint32" UNITS="KB" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_wio" VAL="0.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="boottime" VAL="1189173233" TYPE="uint32" UNITS="s" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="machine_type" VAL="x86_64" TYPE="string" UNITS="" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="os_release" VAL="2.6.9-42.0.10.ELsmp" TYPE="string" UNITS=""
TN="136" TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_nice" VAL="0.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_idle" VAL="100.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_one" VAL="0.02" TYPE="float" UNITS="" TN="21" TMAX="70"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_fifteen" VAL="0.00" TYPE="float" UNITS="" TN="21"
TMAX="950" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="proc_total" VAL="91" TYPE="uint32" UNITS="" TN="761" TMAX="950"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_shared" VAL="0" TYPE="uint32" UNITS="KB" TN="9" TMAX="180"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_cached" VAL="3738088" TYPE="uint32" UNITS="KB" TN="9"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="gexec" VAL="OFF" TYPE="string" UNITS="" TN="130" TMAX="300"
DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="bytes_out" VAL="91.22" TYPE="float" UNITS="bytes/sec" TN="153"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="pkts_in" VAL="23.60" TYPE="float" UNITS="packets/sec" TN="153"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
</HOST>
I hope there are some clues here!
-Dan.
On Wednesday 31 Oct 2007 19:09, Matthias Blankenhaus wrote:
> Dan,
>
> could you post the relevant snippets from gmond.conf from your cluster
> nodes ?
>
> What is the XML output from gmond on the POL cluster ?
>
> Thanx,
> Matthias
>
> On Wed, 31 Oct 2007, Dan Bretherton wrote:
> > Dear All,
> >
> > Here are some updates to the message I posted to the list yesterday:
> >
> > 1) The XML data from gmetad seems to be correct. I got this data from
> > "telnet localhost 8651". I can't see any incorrect nodes listed under
> > POL, so I now suspect the Web frontend rather than gmetad.
> >
> > 2) the data in /var/lib/ganglia/rrds seems to be correct. There are no
> > incorrect nodes listed in the directory for the POL cluster. This also
> > points the finger at the Web frontend.
> >
> > 3) I have tried out the latest versions of gmetad and the Web frontend
> > (3.0.5) with the latest version of rrdtool (1.2.23) on another computer
> > to make sure the problem is not being caused by a bug that has been
> > fixed. I found that the same problem occurs with the latest versions so
> > I have left the public server (http://www.resc.reading.ac.uk/ganglia/) on
> > ganglia version 3.0.3 and rrdtool version 1.2.15.
> >
> > 4) The BAS cluster nodes actually have different IP addresses to the POL
> > nodes of the same name, so the IP addresses are not the cause of the BAS
> > nodes being listed in the POL cluster report.
> >
> > Regards,
> > -Dan.
> >
> > On Tuesday 30 Oct 2007, you wrote:
> > > Dear All,
> > >
> > > This is the first time I have posted to the list, but I have made good
> > > use of the archives on many occasions. Unfortunately I can't find
> > > anything in the archives to help with my current problem.
> > >
> > > I am monitoring a grid consisting of clusters at three institutions
> > > called POL, BAS and ESSC. The clusters are all from the same supplier
> > > and use the same convention for slave node IP addresses and host names.
> > > All the clusters are behind their own institutional firewalls. My
> > > Ganglia Web frontend is at the following address:
> > > http://www.resc.reading.ac.uk/ganglia/
> > >
> > > My problem is that the POL cluster report mixes up nodes from all three
> > > clusters. The POL cluster is listed as "NEMO cluster @ POL" on the
> > > grid report page of my Web frontend. There are three main problems with
> > > the POL cluster report:
> > > 1) Nodes at ESSC and BAS with names not found at POL usually show up
> > > as blank spaces on the POL cluster page unless they are down, in which
> > > case they are represented by the usual pink box
> > > 2) The load level colouring (and hence the positioning on the page) of
> > > nodes that have the same name as nodes in other clusters is often
> > > governed by the other clusters
> > > 3) The overview section of the POL cluster report has incorrect values
> > > for load percentages and number of CPUs etc.
> > >
> > > Here is an excerpt from my gmetad.conf file showing the three data
> > > sources. The host names have been changed for security reasons.
> > >
> > > data_source "POL's gmond" 65 pol.host.name:8649
> > > data_source "ESSC's gmond" 60 essc.host.name:8649
> > > data_source "BAS's gmond through SSH tunnel" 70 localhost:8647
> > >
> > > Here is some more information I think may be relevant.
> > > -- The ESSC cluster is on the same subnet as my Web frontend server
> > > -- There are no problems with the ESSC and BAS cluster reports
> > > -- The XML data received from POL's gmond is correct
> > > -- My gmetad version is 3.0.3, but I get the same problem on my backup
> > > gmetad machine which still has version 2.5.7
> > > -- POL's gmond is version 3.0.3, but ESSC and BAS have gmond version
> > > 2.5.7
> > > -- Accessing POL's gmond through a different port via an SSH tunnel
> > > (i.e. localhost:8648 instead of pol.host.name:8649) makes no difference
> > > -- Changing the order of the data sources in gmetad.conf makes no
> > > difference
> > > -- Removing either the ESSC or the BAS data source makes no difference;
> > > the POL cluster report still gets mixed up with the other cluster,
> > > whichever one it is
> > > -- Deleting all the RRD files in /var/lib/ganglia/rrds/ and starting
> > > again makes no difference
> > > -- The grid report page has correct values for the POL cluster
> > >
> > > I could change the host names and IP addresses of the ESSC cluster
> > > nodes, but that wouldn't stop the POL cluster report getting confused
> > > with BAS nodes and changing those clusters is not an option. Is there
> > > any way to solve this problem without making the node names of all the
> > > clusters different? All suggestions would be gratefully received. I
> > > hope I haven't missed something obvious.
> > >
> > > -Dan Bretherton.
> >
> > --
> > Mr. D.A. Bretherton
> > Environmental Systems Science Centre
> > Harry Pitt Building
> > 3 Earley Gate
> > Reading University
> > Reading, RG6 6AL
> > UK
> >
> > Tel. +44 118 378 7722
> >
> > -------------------------------------------------------------------------
> > This SF.net email is sponsored by: Splunk Inc.
> > Still grepping through log files to find problems? Stop.
> > Now Search log events and configuration files using AJAX and a browser.
> > Download your FREE copy of Splunk now >> http://get.splunk.com/
> > _______________________________________________
> > Ganglia-general mailing list
> > [email protected]
> > https://lists.sourceforge.net/lists/listinfo/ganglia-general
--
Mr. D.A. Bretherton
Environmental Systems Science Centre
Harry Pitt Building
3 Earley Gate
Reading University
Reading, RG6 6AL
UK
Tel. +44 118 378 7722
-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems? Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
_______________________________________________
Ganglia-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/ganglia-general