Hello all,
I want to use Ganglia to monitor all the machines I have, and I've configured 
gmetad and gmond. However, I only ever see the one server that runs gmetad and 
gmond, and I cannot monitor the other hosts. What's wrong? Below is my 
configuration on the host running gmetad and gmond.
----------------------

/*  gmetad */
authority "http://osgc01.grid.sinica.edu.tw/ganglia-webfrontend";
data_source "osgc01.grid.sinica.edu.tw Cluster" osgc01.grid.sinica.edu.tw:8644
data_source "osgs01.grid.sinica.edu.tw Cluster" osgs01.grid.sinica.edu.tw:8647
gridname "ASCC OSG"
trusted_hosts osgc01.grid.sinica.edu.tw
rrd_rootdir "/var/log/ganglia"
----------------------
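(For reference, my understanding is that gmetad polls each data_source host:port 
over TCP and reads the XML that gmond serves on its tcp_accept_channel, so the 
port in a data_source line should match a tcp_accept_channel port on that host. 
A minimal matching pair would look roughly like the sketch below; the cluster 
name, hostname, and port are only placeholders, not my actual setup.)

# gmetad.conf on the collecting host (illustrative host/port only)
data_source "Example Cluster" node01.example.org:8649

/* gmond.conf on node01.example.org */
tcp_accept_channel {
  port = 8649
}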

/* gmond */
/* This configuration is as close to 2.5.x default behavior as possible 
  The values closely match ./gmond/metric.h definitions in 2.5.x */ 
globals {                     
 setuid = yes               
 user = nobody               
 cleanup_threshold = 300 /*secs */ 
} 

/* If a cluster attribute is specified, then all gmond hosts are wrapped inside 
 * of a <CLUSTER> tag.  If you do not specify a cluster tag, then all <HOSTS> will 
 * NOT be wrapped inside of a <CLUSTER> tag. */ 
cluster { 
   name = "osgc01.grid.sinica.edu.tw Cluster" 
   owner = "ASCC - ACADEMIA SINICA COMPUTING CENTRE." 
   url = "http://www.ascc.sinica.edu.tw/"; 
   latlong="N25.05 E121.32" 
} 

/* Feel free to specify as many udp_send_channels as you like.  Gmond 
  used to only support having a single channel */ 
/*udp_send_channel { 
 mcast_join = 239.2.11.71 
 port = 8649 
} */ 
udp_send_channel { 
 mcast_join =  239.2.11.71 
 port = 8644 
} 

/* You can specify as many udp_recv_channels as you like as well. */ 
/*udp_recv_channel { 
 mcast_join = 239.2.11.71 
 port = 8649 
 bind = 239.2.11.71 
} */ 

udp_recv_channel { 
 mcast_join = 239.2.11.71 
 port = 8644 
 bind = 239.2.11.71 
} 
/* You can specify as many tcp_accept_channels as you like to share 
  an xml description of the state of the cluster */ 
/*tcp_accept_channel { 
 port = 8649 
} */ 

tcp_accept_channel { 
 port = 8644 
} 

/* The old internal 2.5.x metric array has been replaced by the following 
  collection_group directives.  What follows is the default behavior for 
  collecting and sending metrics that is as close to 2.5.x behavior as 
  possible. */ 

/* This collection group will cause a heartbeat (or beacon) to be sent every 
  20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses 
  the age of the running gmond. */ 
collection_group { 
 collect_once = yes 
 time_threshold = 20 
 metric { 
   name = "heartbeat" 
 } 
} 

/* This collection group will send general info about this host every 1200 secs. 
   This information doesn't change between reboots and is only collected once. */ 
collection_group { 
 collect_once = yes 
 time_threshold = 1200 
 metric { 
   name = "cpu_num" 
 } 
 metric { 
   name = "cpu_speed" 
 } 
 metric { 
   name = "mem_total" 
 } 
 /* Should this be here? Swap can be added/removed between reboots. */ 
 metric { 
   name = "swap_total" 
 } 
 metric { 
   name = "boottime" 
 } 
 metric { 
   name = "machine_type" 
 } 
 metric { 
   name = "os_name" 
 } 
 metric { 
   name = "os_release" 
 } 
 metric { 
   name = "location" 
 } 
} 

/* This collection group will send the status of gexecd for this host every 300 secs */ 
/* Unlike 2.5.x the default behavior is to report gexecd OFF.  */ 
collection_group { 
 collect_once = yes 
 time_threshold = 300 
 metric { 
   name = "gexec" 
 } 
} 

/* This collection group will collect the CPU status info every 20 secs. 
   The time threshold is set to 90 seconds.  In honesty, this time_threshold could be 
   set significantly higher to reduce unnecessary network chatter. */ 
collection_group { 
 collect_every = 20 
 time_threshold = 90 
 /* CPU status */ 
 metric { 
   name = "cpu_user"   
   value_threshold = "1.0" 
 } 
 metric { 
   name = "cpu_system"   
   value_threshold = "1.0" 
 } 
 metric { 
   name = "cpu_idle"   
   value_threshold = "5.0" 
 } 
 metric { 
   name = "cpu_nice"   
   value_threshold = "1.0" 
 } 
 metric { 
   name = "cpu_aidle" 
   value_threshold = "5.0" 
 } 
 metric { 
   name = "cpu_wio" 
   value_threshold = "1.0" 
 } 
 /* The next two metrics are optional if you want more detail... 
    ... since they are accounted for in cpu_system.   
 metric { 
   name = "cpu_intr" 
   value_threshold = "1.0" 
 } 
 metric { 
   name = "cpu_sintr" 
   value_threshold = "1.0" 
 } 
 */ 
} 

collection_group { 
 collect_every = 20 
 time_threshold = 90 
 /* Load Averages */ 
 metric { 
   name = "load_one" 
   value_threshold = "1.0" 
 } 
 metric { 
   name = "load_five" 
   value_threshold = "1.0" 
 } 
 metric { 
   name = "load_fifteen" 
   value_threshold = "1.0" 
 } 
} 

/* This group collects the number of running and total processes */ 
collection_group { 
 collect_every = 80 
 time_threshold = 950 
 metric { 
   name = "proc_run" 
   value_threshold = "1.0" 
 } 
 metric { 
   name = "proc_total" 
   value_threshold = "1.0" 
 } 
} 

/* This collection group grabs the volatile memory metrics every 40 secs and 
  sends them at least every 180 secs.  This time_threshold can be increased 
  significantly to reduce unneeded network traffic. */ 
collection_group { 
 collect_every = 40 
 time_threshold = 180 
 metric { 
   name = "mem_free" 
   value_threshold = "1024.0" 
 } 
 metric { 
   name = "mem_shared" 
   value_threshold = "1024.0" 
 } 
 metric { 
   name = "mem_buffers" 
   value_threshold = "1024.0" 
 } 
 metric { 
   name = "mem_cached" 
   value_threshold = "1024.0" 
 } 
 metric { 
   name = "swap_free" 
   value_threshold = "1024.0" 
 } 
} 

collection_group { 
 collect_every = 40 
 time_threshold = 300 
 metric { 
   name = "bytes_out" 
   value_threshold = 4096 
 } 
 metric { 
   name = "bytes_in" 
   value_threshold = 4096 
 } 
 metric { 
   name = "pkts_in" 
   value_threshold = 256 
 } 
 metric { 
   name = "pkts_out" 
   value_threshold = 256 
 } 
} 

/* Different than 2.5.x default since the old config made no sense */ 
collection_group { 
 collect_every = 1800 
 time_threshold = 3600 
 metric { 
   name = "disk_total" 
   value_threshold = 1.0 
 } 
} 

collection_group { 
 collect_every = 40 
 time_threshold = 180 
 metric { 
   name = "disk_free" 
   value_threshold = 1.0 
 } 
 metric { 
   name = "part_max_used" 
   value_threshold = 1.0 
 } 
} 
---
Actually, the gmond.conf on the other host is the same as what I wrote 
above except for the udp_send_channel, udp_recv_channel, and tcp_accept_channel 
sections. It's shown below.
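(As a reference point: I believe metrics from a node only reach the aggregating 
gmond when the sender's udp_send_channel and the receiver's udp_recv_channel 
agree on the multicast group and port, roughly like the minimal pair sketched 
below. The group and port here are just the ones from my own files, used purely 
as an illustration.)

/* gmond.conf on a node that should report in */
udp_send_channel {
  mcast_join = 239.2.11.71
  port = 8649
}

/* gmond.conf on the node that answers gmetad */
udp_recv_channel {
  mcast_join = 239.2.11.71
  port = 8649
  bind = 239.2.11.71
}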
--------
 
/* This configuration is as close to 2.5.x default behavior as possible 
  The values closely match ./gmond/metric.h definitions in 2.5.x */ 
globals {                     
 setuid = yes               
 user = nobody               
 cleanup_threshold = 300 /*secs */ 
} 

/* If a cluster attribute is specified, then all gmond hosts are wrapped inside 
 * of a <CLUSTER> tag.  If you do not specify a cluster tag, then all <HOSTS> will 
 * NOT be wrapped inside of a <CLUSTER> tag. */ 
cluster { 
   name = "osgc01.grid.sinica.edu.tw Cluster" 
   owner = "ASCC - ACADEMIA SINICA COMPUTING CENTRE." 
   url = "http://www.ascc.sinica.edu.tw/"; 
   latlong="N25.05 E121.32" 
} 

/* Feel free to specify as many udp_send_channels as you like.  Gmond 
  used to only support having a single channel */ 
udp_send_channel { 
 mcast_join = 239.2.11.71 
 port = 8649 
} 
udp_send_channel { 
 mcast_join =  239.2.11.71 
 port = 8644 
} 

/* You can specify as many udp_recv_channels as you like as well. */ 
udp_recv_channel { 
 mcast_join = 239.2.11.71 
 port = 8649 
 bind = 239.2.11.71 
}

udp_recv_channel { 
 mcast_join = 239.2.11.71 
 port = 8644 
 bind = 239.2.11.71 
} 
/* You can specify as many tcp_accept_channels as you like to share 
  an xml description of the state of the cluster */ 
tcp_accept_channel { 
 port = 8649 
} 

tcp_accept_channel { 
 port = 8644 
} 

/* The old internal 2.5.x metric array has been replaced by the following 
  collection_group directives.  What follows is the default behavior for 
  collecting and sending metrics that is as close to 2.5.x behavior as 
  possible. */ 

/* This collection group will cause a heartbeat (or beacon) to be sent every 
  20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses 
  the age of the running gmond. */ 
collection_group { 
 collect_once = yes 
 time_threshold = 20 
 metric { 
   name = "heartbeat" 
 } 
} 

/* This collection group will send general info about this host every 1200 secs. 
   This information doesn't change between reboots and is only collected once. */ 
collection_group { 
 collect_once = yes 
 time_threshold = 1200 
 metric { 
   name = "cpu_num" 
 } 
 metric { 
   name = "cpu_speed" 
 } 
 metric { 
   name = "mem_total" 
 } 
 /* Should this be here? Swap can be added/removed between reboots. */ 
 metric { 
   name = "swap_total" 
 } 
 metric { 
   name = "boottime" 
 } 
 metric { 
   name = "machine_type" 
 } 
 metric { 
   name = "os_name" 
 } 
 metric { 
   name = "os_release" 
 } 
 metric { 
   name = "location" 
 } 
} 

/* This collection group will send the status of gexecd for this host every 300 secs */ 
/* Unlike 2.5.x the default behavior is to report gexecd OFF.  */ 
collection_group { 
 collect_once = yes 
 time_threshold = 300 
 metric { 
   name = "gexec" 
 } 
} 

/* This collection group will collect the CPU status info every 20 secs. 
   The time threshold is set to 90 seconds.  In honesty, this time_threshold could be 
   set significantly higher to reduce unnecessary network chatter. */ 
collection_group { 
 collect_every = 20 
 time_threshold = 90 
 /* CPU status */ 
 metric { 
   name = "cpu_user"   
   value_threshold = "1.0" 
 } 
 metric { 
   name = "cpu_system"   
   value_threshold = "1.0" 
 } 
 metric { 
   name = "cpu_idle"   
   value_threshold = "5.0" 
 } 
 metric { 
   name = "cpu_nice"   
   value_threshold = "1.0" 
 } 
 metric { 
   name = "cpu_aidle" 
   value_threshold = "5.0" 
 } 
 metric { 
   name = "cpu_wio" 
   value_threshold = "1.0" 
 } 
 /* The next two metrics are optional if you want more detail... 
    ... since they are accounted for in cpu_system.   
 metric { 
   name = "cpu_intr" 
   value_threshold = "1.0" 
 } 
 metric { 
   name = "cpu_sintr" 
   value_threshold = "1.0" 
 } 
 */ 
} 

collection_group { 
 collect_every = 20 
 time_threshold = 90 
 /* Load Averages */ 
 metric { 
   name = "load_one" 
   value_threshold = "1.0" 
 } 
 metric { 
   name = "load_five" 
   value_threshold = "1.0" 
 } 
 metric { 
   name = "load_fifteen" 
   value_threshold = "1.0" 
 } 
} 

/* This group collects the number of running and total processes */ 
collection_group { 
 collect_every = 80 
 time_threshold = 950 
 metric { 
   name = "proc_run" 
   value_threshold = "1.0" 
 } 
 metric { 
   name = "proc_total" 
   value_threshold = "1.0" 
 } 
} 

/* This collection group grabs the volatile memory metrics every 40 secs and 
  sends them at least every 180 secs.  This time_threshold can be increased 
  significantly to reduce unneeded network traffic. */ 
collection_group { 
 collect_every = 40 
 time_threshold = 180 
 metric { 
   name = "mem_free" 
   value_threshold = "1024.0" 
 } 
 metric { 
   name = "mem_shared" 
   value_threshold = "1024.0" 
 } 
 metric { 
   name = "mem_buffers" 
   value_threshold = "1024.0" 
 } 
 metric { 
   name = "mem_cached" 
   value_threshold = "1024.0" 
 } 
 metric { 
   name = "swap_free" 
   value_threshold = "1024.0" 
 } 
} 

collection_group { 
 collect_every = 40 
 time_threshold = 300 
 metric { 
   name = "bytes_out" 
   value_threshold = 4096 
 } 
 metric { 
   name = "bytes_in" 
   value_threshold = 4096 
 } 
 metric { 
   name = "pkts_in" 
   value_threshold = 256 
 } 
 metric { 
   name = "pkts_out" 
   value_threshold = 256 
 } 
} 

/* Different than 2.5.x default since the old config made no sense */ 
collection_group { 
 collect_every = 1800 
 time_threshold = 3600 
 metric { 
   name = "disk_total" 
   value_threshold = 1.0 
 } 
} 

collection_group { 
 collect_every = 40 
 time_threshold = 180 
 metric { 
   name = "disk_free" 
   value_threshold = 1.0 
 } 
 metric { 
   name = "part_max_used" 
   value_threshold = 1.0 
 } 
} 
 
Could anyone help me?


BR
----------------------------------------------------------------------
"Gravitation is not responsible for people falling in love." 

Fu-Ming Tsai
Academia Sinica Computing Centre, Academia Sinica
[EMAIL PROTECTED]
------------------------------------------------------------------------

