Hi Ganglia

Nodes from one particular rack/switch appear as down in Ganglia, and I am not
sure why. Communication seems to be intermittent for some of these nodes and
impossible for others. The rest of the cluster works fine.

It is a ROCKS 5.0 cluster
Most nodes have Infiniband - this rack does not.
This rack has a different kind of Gig-e switch than the rest of the cluster

Otherwise the configuration is the ROCKS install default.

The compute-5-X nodes do not report properly; all other nodes are correct.
See below for the beginning of the XML output.


Thanks
David

On the head end I have tried:

netstat -aunt | grep 8649:

tcp        0      0 0.0.0.0:8649                0.0.0.0:*                   LISTEN
tcp        0      0 10.0.0.1:8649               10.0.0.1:54837              TIME_WAIT
tcp        0      0 127.0.0.1:8649              127.0.0.1:36321             TIME_WAIT
tcp        0      0 127.0.0.1:8649              127.0.0.1:36313             TIME_WAIT
tcp        0      0 127.0.0.1:8649              127.0.0.1:36316             TIME_WAIT
tcp        0      0 127.0.0.1:8649              127.0.0.1:36318             TIME_WAIT
udp        0      0 10.0.0.1:32772              236.126.195.85:8649         ESTABLISHED
udp        0      0 0.0.0.0:8649                0.0.0.0:*
udp   126688      0 0.0.0.0:8649                0.0.0.0:*



telnet 10.0.0.1 8649:

Trying 10.0.0.1...
Connected to hydra.local (10.0.0.1).
Escape character is '^]'.
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<!DOCTYPE GANGLIA_XML [
   <!ELEMENT GANGLIA_XML (GRID|CLUSTER|HOST)*>
      <!ATTLIST GANGLIA_XML VERSION CDATA #REQUIRED>
      <!ATTLIST GANGLIA_XML SOURCE CDATA #REQUIRED>
   <!ELEMENT GRID (CLUSTER | GRID | HOSTS | METRICS)*>
      <!ATTLIST GRID NAME CDATA #REQUIRED>
      <!ATTLIST GRID AUTHORITY CDATA #REQUIRED>
      <!ATTLIST GRID LOCALTIME CDATA #IMPLIED>
   <!ELEMENT CLUSTER (HOST | HOSTS | METRICS)*>
      <!ATTLIST CLUSTER NAME CDATA #REQUIRED>
      <!ATTLIST CLUSTER OWNER CDATA #IMPLIED>
      <!ATTLIST CLUSTER LATLONG CDATA #IMPLIED>
      <!ATTLIST CLUSTER URL CDATA #IMPLIED>
      <!ATTLIST CLUSTER LOCALTIME CDATA #REQUIRED>
   <!ELEMENT HOST (METRIC)*>
      <!ATTLIST HOST NAME CDATA #REQUIRED>
      <!ATTLIST HOST IP CDATA #REQUIRED>
      <!ATTLIST HOST LOCATION CDATA #IMPLIED>
      <!ATTLIST HOST REPORTED CDATA #REQUIRED>
      <!ATTLIST HOST TN CDATA #IMPLIED>
      <!ATTLIST HOST TMAX CDATA #IMPLIED>
      <!ATTLIST HOST DMAX CDATA #IMPLIED>
      <!ATTLIST HOST GMOND_STARTED CDATA #IMPLIED>
   <!ELEMENT METRIC EMPTY>
      <!ATTLIST METRIC NAME CDATA #REQUIRED>
      <!ATTLIST METRIC VAL CDATA #REQUIRED>
      <!ATTLIST METRIC TYPE (string | int8 | uint8 | int16 | uint16 | int32
| uint32 | float | double | timestamp) #REQUIRED>
      <!ATTLIST METRIC UNITS CDATA #IMPLIED>
      <!ATTLIST METRIC TN CDATA #IMPLIED>
      <!ATTLIST METRIC TMAX CDATA #IMPLIED>
      <!ATTLIST METRIC DMAX CDATA #IMPLIED>
      <!ATTLIST METRIC SLOPE (zero | positive | negative | both |
unspecified) #IMPLIED>
      <!ATTLIST METRIC SOURCE (gmond | gmetric) #REQUIRED>
   <!ELEMENT HOSTS EMPTY>
      <!ATTLIST HOSTS UP CDATA #REQUIRED>
      <!ATTLIST HOSTS DOWN CDATA #REQUIRED>
      <!ATTLIST HOSTS SOURCE (gmond | gmetric | gmetad) #REQUIRED>
   <!ELEMENT METRICS EMPTY>
      <!ATTLIST METRICS NAME CDATA #REQUIRED>
      <!ATTLIST METRICS SUM CDATA #REQUIRED>
      <!ATTLIST METRICS NUM CDATA #REQUIRED>
      <!ATTLIST METRICS TYPE (string | int8 | uint8 | int16 | uint16 | int32
| uint32 | float | double | timestamp) #REQUIRED>
      <!ATTLIST METRICS UNITS CDATA #IMPLIED>
      <!ATTLIST METRICS SLOPE (zero | positive | negative | both |
unspecified) #IMPLIED>
      <!ATTLIST METRICS SOURCE (gmond | gmetric) #REQUIRED>
]>
<GANGLIA_XML VERSION="3.0.7" SOURCE="gmond">
<CLUSTER NAME="Hydra" LOCALTIME="1217904258" OWNER="SMAST" LATLONG="N32.87
W117.22" URL="http://www.smast.umassd.edu">
<HOST NAME="compute-2-10.local" IP="10.2.0.10" REPORTED="1217904257" TN="0"
TMAX="20" DMAX="0" LOCATION="2,10,0" GMOND_STARTED="1213646492">
<METRIC NAME="disk_total" VAL="69.962" TYPE="double" UNITS="GB" TN="0"
TMAX="1200" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_speed" VAL="3391" TYPE="uint32" UNITS="MHz" TN="0"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="part_max_used" VAL="78.3" TYPE="float" UNITS="%" TN="0"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="swap_total" VAL="2096472" TYPE="uint32" UNITS="KB" TN="0"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="sys_clock" VAL="1217904257" TYPE="uint32" UNITS="s" TN="0"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_user" VAL="99.9" TYPE="float" UNITS="%" TN="0" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_system" VAL="0.1" TYPE="float" UNITS="%" TN="0" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_intr" VAL="0.0" TYPE="float" UNITS="%" TN="0" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_aidle" VAL="50.1" TYPE="float" UNITS="%" TN="0"
TMAX="3800" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_five" VAL="2.00" TYPE="float" UNITS=" " TN="0" TMAX="325"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="proc_run" VAL="2" TYPE="uint32" UNITS=" " TN="0" TMAX="950"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_free" VAL="638328" TYPE="uint32" UNITS="KB" TN="0"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_buffers" VAL="153812" TYPE="uint32" UNITS="KB" TN="0"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="swap_free" VAL="2096328" TYPE="uint32" UNITS="KB" TN="0"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="bytes_in" VAL="1769.90" TYPE="float" UNITS="bytes/sec" TN="0"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="pkts_out" VAL="0.73" TYPE="float" UNITS="packets/sec" TN="0"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_num" VAL="2" TYPE="uint16" UNITS="CPUs" TN="0" TMAX="1200"
DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="disk_free" VAL="60.285" TYPE="double" UNITS="GB" TN="0"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_total" VAL="2059552" TYPE="uint32" UNITS="KB" TN="0"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_wio" VAL="0.0" TYPE="float" UNITS="%" TN="0" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="boottime" VAL="1213646443" TYPE="uint32" UNITS="s" TN="0"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="machine_type" VAL="x86_64" TYPE="string" UNITS="" TN="0"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_nice" VAL="0.0" TYPE="float" UNITS="%" TN="0" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_idle" VAL="0.0" TYPE="float" UNITS="%" TN="0" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_sintr" VAL="0.0" TYPE="float" UNITS="%" TN="0" TMAX="90"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_one" VAL="2.00" TYPE="float" UNITS=" " TN="0" TMAX="70"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_fifteen" VAL="2.00" TYPE="float" UNITS=" " TN="0"
TMAX="950" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="proc_total" VAL="110" TYPE="uint32" UNITS=" " TN="0"
TMAX="950" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_shared" VAL="0" TYPE="uint32" UNITS="KB" TN="0" TMAX="180"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_cached" VAL="1037368" TYPE="uint32" UNITS="KB" TN="0"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mtu" VAL="16436" TYPE="uint32" UNITS="" TN="0" TMAX="1200"
DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="bytes_out" VAL="63.52" TYPE="float" UNITS="bytes/sec" TN="0"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="pkts_in" VAL="9.35" TYPE="float" UNITS="packets/sec" TN="0"
TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="sample-metric" VAL="Linux compute-2-10.local
2.6.18-53.1.14.el5 #1 SMP Wed Mar 5 11:37:38 EST 2008 x86_64 x86_64 x86_64
GNU/Linux
" TYPE="string" UNITS="" TN="113" TMAX="1800" DMAX="9000" SLOPE="zero"
SOURCE="gmetric"/>
<METRIC NAME="ps-22862" VAL="cmd=fvcom, user=cchen, %cpu=99.90, %mem=1.67,
size=8500, data=24404, shared=10000, vm=104600" TYPE="string" UNITS=""
TN="17" TMAX="120" DMAX="120" SLOPE="zero" SOURCE="gmetric"/>
<METRIC NAME="rocks-version" VAL="5.0" TYPE="string" UNITS="" TN="113"
TMAX="3600" DMAX="18000" SLOPE="zero" SOURCE="gmetric"/>
<METRIC NAME="ps-22863" VAL="cmd=fvcom, user=cchen, %cpu=99.50, %mem=1.50,
size=8500, data=25028, shared=5916, vm=105168" TYPE="string" UNITS=""
TN="17" TMAX="120" DMAX="120" SLOPE="zero" SOURCE="gmetric"/>
</HOST>
<HOST NAME="compute-5-20.local" IP="10.5.0.20" REPORTED="1217750932"
TN="153325" TMAX="20" DMAX="0" LOCATION="unspecified" GMOND_STARTED="0">
</HOST>
<HOST NAME="compute-5-21.local" IP="10.5.0.21" REPORTED="1217880241"
TN="24016" TMAX="20" DMAX="0" LOCATION="5,21,0" GMOND_STARTED="1213302835">
<METRIC NAME="disk_total" VAL="71.058" TYPE="double" UNITS="GB" TN="3222962"
TMAX="1200" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_speed" VAL="3391" TYPE="uint32" UNITS="MHz" TN="3222962"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="part_max_used" VAL="65.0" TYPE="float" UNITS="%" TN="3222962"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="swap_total" VAL="1020116" TYPE="uint32" UNITS="KB"
TN="3222962" TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="sys_clock" VAL="1214681295" TYPE="uint32" UNITS="s"
TN="3222962" TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_user" VAL="0.0" TYPE="float" UNITS="%" TN="3222962"
TMAX="90" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_system" VAL="0.0" TYPE="float" UNITS="%" TN="3222962"
TMAX="90" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_intr" VAL="0.0" TYPE="float" UNITS="%" TN="3222962"
TMAX="90" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_aidle" VAL="97.9" TYPE="float" UNITS="%" TN="3222962"
TMAX="3800" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_five" VAL="0.02" TYPE="float" UNITS=" " TN="3222962"
TMAX="325" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="proc_run" VAL="0" TYPE="uint32" UNITS=" " TN="3222962"
TMAX="950" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_free" VAL="1377360" TYPE="uint32" UNITS="KB" TN="3222962"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_buffers" VAL="112648" TYPE="uint32" UNITS="KB"
TN="3222962" TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="swap_free" VAL="985912" TYPE="uint32" UNITS="KB" TN="3222962"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="bytes_in" VAL="13.10" TYPE="float" UNITS="bytes/sec"
TN="3222962" TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="pkts_out" VAL="0.65" TYPE="float" UNITS="packets/sec"
TN="3222962" TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_num" VAL="2" TYPE="uint16" UNITS="CPUs" TN="3222962"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="disk_free" VAL="61.328" TYPE="double" UNITS="GB" TN="3222962"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_total" VAL="2059552" TYPE="uint32" UNITS="KB" TN="3222962"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_wio" VAL="0.0" TYPE="float" UNITS="%" TN="3222962"
TMAX="90" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="boottime" VAL="1213302791" TYPE="uint32" UNITS="s"
TN="3222962" TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="machine_type" VAL="x86_64" TYPE="string" UNITS="" TN="3222962"
TMAX="1200" DMAX="0" SLOPE="zero" SOURCE="gmond"/>
<METRIC NAME="cpu_nice" VAL="0.0" TYPE="float" UNITS="%" TN="3222962"
TMAX="90" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_idle" VAL="100.0" TYPE="float" UNITS="%" TN="3222962"
TMAX="90" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="cpu_sintr" VAL="0.0" TYPE="float" UNITS="%" TN="3222962"
TMAX="90" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_one" VAL="0.04" TYPE="float" UNITS=" " TN="3222962"
TMAX="70" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="load_fifteen" VAL="0.00" TYPE="float" UNITS=" " TN="3222962"
TMAX="950" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="proc_total" VAL="102" TYPE="uint32" UNITS=" " TN="3222962"
TMAX="950" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_shared" VAL="0" TYPE="uint32" UNITS="KB" TN="3222962"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mem_cached" VAL="478268" TYPE="uint32" UNITS="KB" TN="3222962"
TMAX="180" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="mtu" VAL="16436" TYPE="uint32" UNITS="" TN="3222962"
TMAX="1200" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="bytes_out" VAL="52.60" TYPE="float" UNITS="bytes/sec"
TN="3222962" TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
<METRIC NAME="pkts_in" VAL="0.17" TYPE="float" UNITS="packets/sec"
TN="3222962" TMAX="300" DMAX="0" SLOPE="both" SOURCE="gmond"/>
</HOST>
-------------------------------------------------------------------------
This SF.Net email is sponsored by the Moblin Your Move Developer's challenge
Build the coolest Linux based applications with Moblin SDK & win great prizes
Grand prize is a trip for two to an Open Source event anywhere in the world
http://moblin-contest.org/redirect.php?banner_id=100&url=/
_______________________________________________
Ganglia-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/ganglia-general

Reply via email to