OK, here goes...
Here are the sections of gmond.conf showing lines that are different from the
default gmond.conf (which comes from RPM installations, I think) for all three
clusters.
ESSC (version 2.5.7):
name "ESSC Cluster"
trusted_hosts gorgon pegasus
all_trusted on
BAS (version 2.5.7):
name "ClusterVision Cluster"
POL (version 3.0.3):
cluster {
name = "NEMO cluster @ POL"
owner = "root"
latlong = "unspecified"
url = "http://www.pol.ac.uk"
}
tcp_accept_channel {
port = 8649
acl {
default = "deny"
access {
ip = 127.0.0.1
mask = 32
action = "allow"
}
access {
ip = <ESSC network address>
mask = 24
action = "allow"
}
access {
ip = <POL network address>
mask = 24
action = "allow"
}
}
}
Here is the start of the XML data from POL.
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<!DOCTYPE GANGLIA_XML [
<!ELEMENT GANGLIA_XML (GRID|CLUSTER|HOST)*>
<!ATTLIST GANGLIA_XML VERSION CDATA #REQUIRED>
<!ATTLIST GANGLIA_XML SOURCE CDATA #REQUIRED>
<!ELEMENT GRID (CLUSTER | GRID | HOSTS | METRICS)*>
<!ATTLIST GRID NAME CDATA #REQUIRED>
<!ATTLIST GRID AUTHORITY CDATA #REQUIRED>
<!ATTLIST GRID LOCALTIME CDATA #IMPLIED>
<!ELEMENT CLUSTER (HOST | HOSTS | METRICS)*>
<!ATTLIST CLUSTER NAME CDATA #REQUIRED>
<!ATTLIST CLUSTER OWNER CDATA #IMPLIED>
<!ATTLIST CLUSTER LATLONG CDATA #IMPLIED>
<!ATTLIST CLUSTER URL CDATA #IMPLIED>
<!ATTLIST CLUSTER LOCALTIME CDATA #REQUIRED>
<!ELEMENT HOST (METRIC)*>
<!ATTLIST HOST NAME CDATA #REQUIRED>
<!ATTLIST HOST IP CDATA #REQUIRED>
<!ATTLIST HOST LOCATION CDATA #IMPLIED>
<!ATTLIST HOST REPORTED CDATA #REQUIRED>
<!ATTLIST HOST TN CDATA #IMPLIED>
<!ATTLIST HOST TMAX CDATA #IMPLIED>
<!ATTLIST HOST DMAX CDATA #IMPLIED>
<!ATTLIST HOST GMOND_STARTED CDATA #IMPLIED>
<!ELEMENT METRIC EMPTY>
<!ATTLIST METRIC NAME CDATA #REQUIRED>
<!ATTLIST METRIC VAL CDATA #REQUIRED>
<!ATTLIST METRIC TYPE (string | int8 | uint8 | int16 | uint16 | int32 |
uint32 | float | double | timestamp) #REQUIRED>
<!ATTLIST METRIC UNITS CDATA #IMPLIED>
<!ATTLIST METRIC TN CDATA #IMPLIED>
<!ATTLIST METRIC TMAX CDATA #IMPLIED>
<!ATTLIST METRIC DMAX CDATA #IMPLIED>
<!ATTLIST METRIC SLOPE (zero | positive | negative | both | unspecified)
#IMPLIED>
<!ATTLIST METRIC SOURCE (gmond | gmetric) #REQUIRED>
<!ELEMENT HOSTS EMPTY>
<!ATTLIST HOSTS UP CDATA #REQUIRED>
<!ATTLIST HOSTS DOWN CDATA #REQUIRED>
<!ATTLIST HOSTS SOURCE (gmond | gmetric | gmetad) #REQUIRED>
<!ELEMENT METRICS EMPTY>
<!ATTLIST METRICS NAME CDATA #REQUIRED>
<!ATTLIST METRICS SUM CDATA #REQUIRED>
<!ATTLIST METRICS NUM CDATA #REQUIRED>
<!ATTLIST METRICS TYPE (string | int8 | uint8 | int16 | uint16 | int32 |
uint32 | float | double | timestamp) #REQUIRED>
<!ATTLIST METRICS UNITS CDATA #IMPLIED>
<!ATTLIST METRICS SLOPE (zero | positive | negative | both |
unspecified) #IMPLIED>
<!ATTLIST METRICS SOURCE (gmond | gmetric) #REQUIRED>
]>
<GANGLIA_XML VERSION="3.0.3" SOURCE="gmond">
<CLUSTER NAME="NEMO cluster @ POL" LOCALTIME="1193931648" OWNER="root"
LATLONG="unspecified" URL="http://www.pol.ac.uk">
<HOST NAME="node001.beowulf.cluster" IP="10.141.0.1" REPORTED="1193931638"
TN="9" TMAX="20" DMAX="0" LOCATION="unspecified" GMOND_STARTED="1189409909">
<METRIC NAME="disk_total" VAL="62.340" TYPE="double" UNITS="GB" TN="137"
TMAX="1200" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_speed" VAL="2593" TYPE="uint32" UNITS="MHz" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="part_max_used" VAL="36.3" TYPE="float" UNITS="" TN="125"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="swap_total" VAL="16322000" TYPE="uint32" UNITS="KB" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="os_name" VAL="Linux" TYPE="string" UNITS="" TN="136" TMAX="1200"
DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_user" VAL="0.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_system" VAL="0.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_aidle" VAL="99.5" TYPE="float" UNITS="%" TN="69" TMAX="3800"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_five" VAL="0.01" TYPE="float" UNITS="" TN="21" TMAX="325"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="proc_run" VAL="0" TYPE="uint32" UNITS="" TN="761" TMAX="950"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_free" VAL="4204152" TYPE="uint32" UNITS="KB" TN="9"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_buffers" VAL="90788" TYPE="uint32" UNITS="KB" TN="9"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="swap_free" VAL="16322000" TYPE="uint32" UNITS="KB" TN="9"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="bytes_in" VAL="1521.07" TYPE="float" UNITS="bytes/sec" TN="153"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="pkts_out" VAL="0.50" TYPE="float" UNITS="packets/sec" TN="153"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_num" VAL="4" TYPE="uint16" UNITS="CPUs" TN="136" TMAX="1200"
DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="disk_free" VAL="57.743" TYPE="double" UNITS="GB" TN="125"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_total" VAL="8163180" TYPE="uint32" UNITS="KB" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_wio" VAL="0.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="boottime" VAL="1189173233" TYPE="uint32" UNITS="s" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="machine_type" VAL="x86_64" TYPE="string" UNITS="" TN="136"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="os_release" VAL="2.6.9-42.0.10.ELsmp" TYPE="string" UNITS=""
TN="136" TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_nice" VAL="0.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_idle" VAL="100.0" TYPE="float" UNITS="%" TN="69" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_one" VAL="0.02" TYPE="float" UNITS="" TN="21" TMAX="70"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_fifteen" VAL="0.00" TYPE="float" UNITS="" TN="21"
TMAX="950" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="proc_total" VAL="91" TYPE="uint32" UNITS="" TN="761" TMAX="950"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_shared" VAL="0" TYPE="uint32" UNITS="KB" TN="9" TMAX="180"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_cached" VAL="3738088" TYPE="uint32" UNITS="KB" TN="9"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="gexec" VAL="OFF" TYPE="string" UNITS="" TN="130" TMAX="300"
DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="bytes_out" VAL="91.22" TYPE="float" UNITS="bytes/sec" TN="153"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="pkts_in" VAL="23.60" TYPE="float" UNITS="packets/sec" TN="153"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
</HOST>
I hope there are some clues here!
-Dan.
On Wednesday 31 Oct 2007 19:09, Matthias Blankenhaus wrote:
> Dan,
>
> could you post the relevant snippets from gmond.conf from your cluster
> nodes ?
>
> What is the XML output from gmond on the POL cluster ?
>
> Thanx,
> Matthias
>
> On Wed, 31 Oct 2007, Dan Bretherton wrote:
> > Dear All,
> >
> > Here are some updates to the message I posted to the list yesterday:
> >
> > 1) The XML data from gmetad seems to be correct. I got this data from
> > "telnet localhost 8651". I can't see any incorrect nodes listed under
> > POL, so I now suspect the Web frontend rather than gmetad.
> >
> > 2) the data in /var/lib/ganglia/rrds seems to be correct. There are no
> > incorrect nodes listed in the directory for the POL cluster. This also
> > points the finger at the Web frontend.
> >
> > 3) I have tried out the latest versions of gmetad and the Web frontend
> > (3.0.5) with the latest version of rrdtool (1.2.23) on another computer
> > to make sure the problem is not being caused by a bug that has been
> > fixed. I found that the same problem occurs with the latest versions so
> > I have left the public server (http://www.resc.reading.ac.uk/ganglia/) on
> > ganglia version 3.0.3 and rrdtool version 1.2.15.
> >
> > 4) The BAS cluster nodes actually have different IP addresses to the POL
> > nodes of the same name, so the IP addresses are not the cause of the BAS
> > nodes being listed in the POL cluster report.
> >
> > Regards,
> > -Dan.
> >
> > On Tuesday 30 Oct 2007, you wrote:
> > > Dear All,
> > >
> > > This is the first time I have posted to the list, but I have made good
> > > use of the archives on many occasions. Unfortunately I can't find
> > > anything in the archives to help with my current problem.
> > >
> > > I am monitoring a grid consisting of clusters at three institutions
> > > called POL, BAS and ESSC. The clusters are all from the same supplier
> > > and use the same convention for slave node IP addresses and host names.
> > > All the clusters are behind their own institutional firewalls. My
> > > Ganglia Web frontend is at the following address:
> > > http://www.resc.reading.ac.uk/ganglia/
> > >
> > > My problem is that the POL cluster report mixes up nodes from all three
> > > clusters. The POL cluster is listed as "NEMO cluster @ POL" on the
> > > grid report page of my Web frontend. There are three main problems with
> > > the POL cluster report:
> > > 1) Nodes at ESSC and BAS with names not found at POL usually show up
> > > as blank spaces on the POL cluster page unless they are down, in which
> > > case they are represented by the usual pink box
> > > 2) The load level colouring (and hence the positioning on the page) of
> > > nodes that have the same name as nodes in other clusters is often
> > > governed by the other clusters
> > > 3) The overview section of the POL cluster report has incorrect values
> > > for load percentages and number of CPUs etc.
> > >
> > > Here is an excerpt from my gmetad.conf file showing the three data
> > > sources. The host names have been changed for security reasons.
> > >
> > > data_source "POL's gmond" 65 pol.host.name:8649
> > > data_source "ESSC's gmond" 60 essc.host.name:8649
> > > data_source "BAS's gmond through SSH tunnel" 70 localhost:8647
> > >
> > > Here is some more information I think may be relevant.
> > > -- The ESSC cluster is on the same subnet as my Web frontend server
> > > -- There are no problems with the ESSC and BAS cluster reports
> > > -- The XML data received from POL's gmond is correct
> > > -- My gmetad version is 3.0.3, but I get the same problem on my backup
> > > gmetad machine which still has version 2.5.7
> > > -- POL's gmond is version 3.0.3, but ESSC and BAS have gmond version
> > > 2.5.7
> > > -- Accessing POL's gmond through a different port via an SSH tunnel
> > > (i.e. localhost:8648 instead of pol.host.name:8649) makes no difference
> > > -- Changing the order of the data sources in gmetad.conf makes no
> > > difference
> > > -- Removing either the ESSC or the BAS data source makes no difference;
> > > the POL cluster report still gets mixed up with the other cluster,
> > > whichever one it is
> > > -- Deleting all the RRD files in /var/lib/ganglia/rrds/ and starting
> > > again makes no difference
> > > -- The grid report page has correct values for the POL cluster
> > >
> > > I could change the host names and IP addresses of the ESSC cluster
> > > nodes, but that wouldn't stop the POL cluster report getting confused
> > > with BAS nodes and changing those clusters is not an option. Is there
> > > any way to solve this problem without making the node names of all the
> > > clusters different? All suggestions would be gratefully received. I
> > > hope I haven't missed something obvious.
> > >
> > > -Dan Bretherton.
> >
> > --
> > Mr. D.A. Bretherton
> > Environmental Systems Science Centre
> > Harry Pitt Building
> > 3 Earley Gate
> > Reading University
> > Reading, RG6 6AL
> > UK
> >
> > Tel. +44 118 378 7722
> >
> > -------------------------------------------------------------------------
> > This SF.net email is sponsored by: Splunk Inc.
> > Still grepping through log files to find problems? Stop.
> > Now Search log events and configuration files using AJAX and a browser.
> > Download your FREE copy of Splunk now >> http://get.splunk.com/
> > _______________________________________________
> > Ganglia-general mailing list
> > [email protected]
> > https://lists.sourceforge.net/lists/listinfo/ganglia-general
--
Mr. D.A. Bretherton
Environmental Systems Science Centre
Harry Pitt Building
3 Earley Gate
Reading University
Reading, RG6 6AL
UK
Tel. +44 118 378 7722
-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems? Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
_______________________________________________
Ganglia-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/ganglia-general