Hi,

I have just installed ganglia 3.0.2 on a cluster with Fedora Core 2.
It built fine from source.

For my default gmond.conf file, I used the output of

gmond -t

and then updated the cluster name field.
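
In other words, roughly this on each node (assuming the default
/etc/gmond.conf location):

gmond -t > /etc/gmond.conf

after which the cluster section reads:

cluster {
  name = "My Linux Cluster"
}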

The rest seems set up to use multicasting to share
metrics between gmond processes on different nodes.

The trouble is that it doesn't.  Telnetting to the XML port only downloads
info for the local host.  Ditto when using gmetad.
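
For example, querying a node directly with

telnet jac-20 8649

returns the XML, but with only a single <HOST> element for jac-20 itself and
nothing from the other nodes.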

It looks like the gmond processes aren't sharing their metrics.

Anyone have any idea what I am doing wrong?

netstat shows the following established socket:

udp 0 0 jac-20:38020 239.2.11.71:8649 ESTABLISHED 3355/gmond
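
A couple of other checks that might narrow this down (assuming tcpdump is
available on the nodes):

netstat -g

should show eth0 subscribed to 239.2.11.71, and

tcpdump -i eth0 udp port 8649

should show whether metric packets from the other nodes are arriving at all.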

Any help gratefully accepted.  I have attached my gmond.conf file.
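
For completeness, the gmetad side: its gmetad.conf points at the cluster
with a data_source line along the lines of

data_source "My Linux Cluster" jac-20:8649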

Dave Robson


/* This configuration is as close to 2.5.x default behavior as possible 
   The values closely match ./gmond/metric.h definitions in 2.5.x */ 
globals {                    
  setuid = yes              
  user = nobody              
  cleanup_threshold = 300 /* secs */
  debug_level = 200
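  /* NB: a non-zero debug_level normally keeps gmond in the foreground */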
} 

/* If a cluster attribute is specified, then all gmond hosts are wrapped inside
 * of a <CLUSTER> tag.  If you do not specify a cluster tag, then all <HOSTS> will
 * NOT be wrapped inside of a <CLUSTER> tag. */
cluster { 
  name = "My Linux Cluster" 
} 

/* The host section describes attributes of the host, like the location */ 
host { 
  location = "unspecified" 
} 

/* Feel free to specify as many udp_send_channels as you like.  Gmond 
   used to only support having a single channel */ 
udp_send_channel { 
  mcast_join = 239.2.11.71 
  port = 8649
  mcast_if = eth0
} 
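
/* If multicast turns out to be blocked between the nodes, a unicast
   fallback sketch (the head-node name here is hypothetical) would replace
   the channel above:

udp_send_channel {
  host = headnode
  port = 8649
}
*/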

/* You can specify as many udp_recv_channels as you like as well. */ 
udp_recv_channel { 
  mcast_join = 239.2.11.71 
  port = 8649
  bind = 239.2.11.71 
  mcast_if = eth0
} 
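
/* With the unicast sketch above, the receiving head node would only need:

udp_recv_channel {
  port = 8649
}
*/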

/* You can specify as many tcp_accept_channels as you like to share 
   an xml description of the state of the cluster */ 
tcp_accept_channel { 
  port = 8649 
} 


/* The old internal 2.5.x metric array has been replaced by the following 
   collection_group directives.  What follows is the default behavior for 
   collecting and sending metrics that is as close to 2.5.x behavior as 
   possible. */

/* This collection group will cause a heartbeat (or beacon) to be sent every 
   20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses 
   the age of the running gmond. */ 
collection_group { 
  collect_once = yes 
  time_threshold = 20 
  metric { 
    name = "heartbeat" 
  } 
} 

/* This collection group will send general info about this host every 1200 secs.
   This information doesn't change between reboots and is only collected once. */
collection_group { 
  collect_once = yes 
  time_threshold = 1200 
  metric { 
    name = "cpu_num" 
  } 
  metric { 
    name = "cpu_speed" 
  } 
  metric { 
    name = "mem_total" 
  } 
  /* Should this be here? Swap can be added/removed between reboots. */ 
  metric { 
    name = "swap_total" 
  } 
  metric { 
    name = "boottime" 
  } 
  metric { 
    name = "machine_type" 
  } 
  metric { 
    name = "os_name" 
  } 
  metric { 
    name = "os_release" 
  } 
  metric { 
    name = "location" 
  } 
} 

/* This collection group will send the status of gexecd for this host
   every 300 secs */
/* Unlike 2.5.x the default behavior is to report gexecd OFF.  */
collection_group { 
  collect_once = yes 
  time_threshold = 300 
  metric { 
    name = "gexec" 
  } 
} 

/* This collection group will collect the CPU status info every 20 secs.
   The time threshold is set to 90 seconds.  In honesty, this time_threshold
   could be set significantly higher to reduce unnecessary network chatter. */
collection_group { 
  collect_every = 20 
  time_threshold = 90 
  /* CPU status */ 
  metric { 
    name = "cpu_user"  
    value_threshold = "1.0" 
  } 
  metric { 
    name = "cpu_system"   
    value_threshold = "1.0" 
  } 
  metric { 
    name = "cpu_idle"  
    value_threshold = "5.0" 
  } 
  metric { 
    name = "cpu_nice"  
    value_threshold = "1.0" 
  } 
  metric { 
    name = "cpu_aidle" 
    value_threshold = "5.0" 
  } 
  metric { 
    name = "cpu_wio" 
    value_threshold = "1.0" 
  } 
  /* The next two metrics are optional if you want more detail... 
     ... since they are accounted for in cpu_system.  
  metric { 
    name = "cpu_intr" 
    value_threshold = "1.0" 
  } 
  metric { 
    name = "cpu_sintr" 
    value_threshold = "1.0" 
  } 
  */ 
} 

collection_group { 
  collect_every = 20 
  time_threshold = 90 
  /* Load Averages */ 
  metric { 
    name = "load_one" 
    value_threshold = "1.0" 
  } 
  metric { 
    name = "load_five" 
    value_threshold = "1.0" 
  } 
  metric { 
    name = "load_fifteen" 
    value_threshold = "1.0" 
  }
} 

/* This group collects the number of running and total processes */ 
collection_group { 
  collect_every = 80 
  time_threshold = 950 
  metric { 
    name = "proc_run" 
    value_threshold = "1.0" 
  } 
  metric { 
    name = "proc_total" 
    value_threshold = "1.0" 
  } 
}

/* This collection group grabs the volatile memory metrics every 40 secs and 
   sends them at least every 180 secs.  This time_threshold can be increased 
   significantly to reduce unneeded network traffic. */ 
collection_group { 
  collect_every = 40 
  time_threshold = 180 
  metric { 
    name = "mem_free" 
    value_threshold = "1024.0" 
  } 
  metric { 
    name = "mem_shared" 
    value_threshold = "1024.0" 
  } 
  metric { 
    name = "mem_buffers" 
    value_threshold = "1024.0" 
  } 
  metric { 
    name = "mem_cached" 
    value_threshold = "1024.0" 
  } 
  metric { 
    name = "swap_free" 
    value_threshold = "1024.0" 
  } 
} 

collection_group { 
  collect_every = 40 
  time_threshold = 300 
  metric { 
    name = "bytes_out" 
    value_threshold = 4096 
  } 
  metric { 
    name = "bytes_in" 
    value_threshold = 4096 
  } 
  metric { 
    name = "pkts_in" 
    value_threshold = 256 
  } 
  metric { 
    name = "pkts_out" 
    value_threshold = 256 
  } 
}

/* Different than 2.5.x default since the old config made no sense */ 
collection_group { 
  collect_every = 1800 
  time_threshold = 3600 
  metric { 
    name = "disk_total" 
    value_threshold = 1.0 
  } 
}

collection_group { 
  collect_every = 40 
  time_threshold = 180 
  metric { 
    name = "disk_free" 
    value_threshold = 1.0 
  } 
  metric { 
    name = "part_max_used" 
    value_threshold = 1.0 
  } 
}
