Hello Mailing list,

This issue has been bothering me for quite some time: I'm getting a high
number of stale passive check alerts. It seems like some passive checks are
not being processed. I currently have 6596 incoming passive checks every 5
minutes. The rest of the relevant configuration is as follows:

define service{
       name                            template_passive
       active_checks_enabled           0
       passive_checks_enabled          1
       parallelize_check               0
       obsess_over_service             0
       check_freshness                 1
       freshness_threshold             600
       check_command                   check_stale_passive
       notifications_enabled           1
       event_handler_enabled           0
       flap_detection_enabled          1
       failure_prediction_enabled      0
       process_perf_data               0
       retain_status_information       1
       retain_nonstatus_information    1
       is_volatile                     0
       check_period                    24x7
       max_check_attempts              1
       normal_check_interval           1
       retry_check_interval            1
       contact_groups                  admin
       notification_options            c
       notification_interval           0
       notification_period             24x7
       register                        0
       }

# nagios.cfg
max_check_result_reaper_time=15
check_result_reaper_frequency=5
service_freshness_check_interval=780
host_freshness_check_interval=90
status_update_interval=20
check_external_commands=1
command_check_interval=-1
external_command_buffer_slots=8192
event_broker_options=-1
use_syslog=0
log_notifications=1
log_service_retries=1
log_host_retries=1
log_event_handlers=1
log_initial_states=0
log_external_commands=1
log_passive_checks=1
max_service_check_spread=30
max_host_check_spread=30
max_concurrent_checks=0
max_check_result_file_age=3600
cached_host_check_horizon=15
cached_service_check_horizon=15
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
soft_state_dependencies=0
auto_reschedule_checks=0
auto_rescheduling_interval=30
auto_rescheduling_window=180
sleep_time=0.125
service_check_timeout=60
host_check_timeout=30
event_handler_timeout=30
notification_timeout=30
ocsp_timeout=5
perfdata_timeout=5
retain_state_information=1
retention_update_interval=60
use_retained_program_state=0
use_retained_scheduling_info=1
retained_host_attribute_mask=0
retained_service_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0
interval_length=60
use_aggressive_host_checking=0
execute_service_checks=1
accept_passive_service_checks=1
execute_host_checks=1
accept_passive_host_checks=1
enable_notifications=1
enable_event_handlers=1
process_performance_data=0
obsess_over_services=0
obsess_over_hosts=0
translate_passive_host_checks=0
passive_host_checks_are_soft=0
check_for_orphaned_services=1
check_for_orphaned_hosts=1
check_service_freshness=1
check_host_freshness=1
additional_freshness_latency=15
enable_flap_detection=1
low_service_flap_threshold=5.0
high_service_flap_threshold=20.0
low_host_flap_threshold=5.0
high_host_flap_threshold=20.0
p1_file=/usr/local/nagios/sbin/p1.pl
enable_embedded_perl=1
use_embedded_perl_implicitly=1
use_regexp_matching=1
use_true_regexp_matching=0
daemon_dumps_core=0
use_large_installation_tweaks=1
enable_environment_macros=0
free_child_process_memory=0
child_processes_fork_twice=0
debug_level=0
debug_verbosity=1
max_debug_file_size=1000000


My current situation: Nagios fails to process an average of approximately
600 out of 6596 passive check results every 5 minutes.

I admit I don't know Nagios that well; I started installing/using Nagios
only recently, and I don't know where/how to start troubleshooting this. I
did install mrtg and did a good amount of trial and error with the config,
especially max_check_result_reaper_time and check_result_reaper_frequency,
but increasing or decreasing the values of these variables only worsens the
current situation.

However, this pstree output looks like a qualified starting point:


[EMAIL PROTECTED] nagios]# pstree -cpG | grep nagios
       ├─nagios(7943)───{nagios}(7944)

[EMAIL PROTECTED] tmp]# strace -s50 -p 7944
Process 7944 attached - interrupt to quit
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN}], 1, 500)   = 0
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291780] PROCESS_SERVICE_CHECK_RESULT;foopet"..., 4096) = 94
read(4, 0x2aaaaaaad000, 4096)           = -1 EAGAIN (Resource temporarily
unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291780] PROCESS_SERVICE_CHECK_RESULT;foopet"..., 4096) = 92
read(4, 0x2aaaaaaad000, 4096)           = -1 EAGAIN (Resource temporarily
unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooaptm"..., 4096) = 94
read(4, 0x2aaaaaaad000, 4096)           = -1 EAGAIN (Resource temporarily
unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooaptm"..., 4096) = 92
read(4, 0x2aaaaaaad000, 4096)           = -1 EAGAIN (Resource temporarily
unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooapet"..., 4096) = 93
read(4, 0x2aaaaaaad000, 4096)           = -1 EAGAIN (Resource temporarily
unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooapet"..., 4096) = 94
read(4, 0x2aaaaaaad000, 4096)           = -1 EAGAIN (Resource temporarily
unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;foopet"..., 4096) = 92
read(4, 0x2aaaaaaad000, 4096)           = -1 EAGAIN (Resource temporarily
unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1
read(4, "[1227291781] PROCESS_SERVICE_CHECK_RESULT;fooapet"..., 4096) = 94
read(4, 0x2aaaaaaad000, 4096)           = -1 EAGAIN (Resource temporarily
unavailable)
poll([{fd=4, events=POLLIN, revents=POLLIN}], 1, 500) = 1

[EMAIL PROTECTED] tmp]# ls -l /proc/7944/fd
total 0
lr-x------ 1 root root 64 Nov 21 13:14 0 -> /dev/null
l-wx------ 1 root root 64 Nov 21 13:14 1 -> /dev/null
l-wx------ 1 root root 64 Nov 21 13:14 2 -> /dev/null
lrwx------ 1 root root 64 Nov 21 13:14 3 -> /var/run/nagios.pid
lrwx------ 1 root root 64 Nov 21 13:14 4 -> /var/log/nagios/rw/nagios.cmd

Are the "EAGAIN (Resource temporarily unavailable)" messages normal?
If yes, what kind of output do I need to produce in order to verify/abandon
my gut feeling that Nagios is not processing all results?
If not, any suggestions on how to attack the problem?

Thank you in advance.

Regards,
Marc

server specs:

[EMAIL PROTECTED] tmp]# cat /etc/*release
Red Hat Enterprise Linux Server release 5.1 (Tikanga)
[EMAIL PROTECTED] tmp]# free -m
            total       used       free     shared    buffers     cached
Mem:         31905      23681       8224          0        553      15672

8 cpus
processor       : 7
vendor_id       : AuthenticAMD
cpu family      : 15
model           : 33
model name      : AMD Opteron (tm) Processor 880
stepping        : 2
cpu MHz         : 2400.000
cache size      : 1024 KB

[EMAIL PROTECTED] tmp]# /usr/local/nagios/sbin/nagios -v /etc/nagios/nagios.cfg

Nagios 3.0.3
Copyright (c) 1999-2008 Ethan Galstad (http://www.nagios.org)
Last Modified: 06-25-2008
License: GPL

Reading configuration data...

Running pre-flight check on configuration data...

Checking services...
       Checked 7491 services.
Checking hosts...
       Checked 460 hosts.
Checking host groups...
       Checked 30 host groups.
Checking service groups...
       Checked 0 service groups.
Checking contacts...
       Checked 3 contacts.
Checking contact groups...
       Checked 3 contact groups.
Checking service escalations...
       Checked 0 service escalations.
Checking service dependencies...
       Checked 0 service dependencies.
Checking host escalations...
       Checked 0 host escalations.
Checking host dependencies...
       Checked 0 host dependencies.
Checking commands...
       Checked 28 commands.
Checking time periods...
       Checked 6 time periods.
Checking for circular paths between hosts...
Checking for circular host and service dependencies...
Checking global event handlers...
Checking obsessive compulsive processor commands...
Checking misc settings...

Total Warnings: 0
Total Errors:   0

Things look okay - No serious problems were detected during the pre-flight
check
-------------------------------------------------------------------------
This SF.Net email is sponsored by the Moblin Your Move Developer's challenge
Build the coolest Linux based applications with Moblin SDK & win great prizes
Grand prize is a trip for two to an Open Source event anywhere in the world
http://moblin-contest.org/redirect.php?banner_id=100&url=/
_______________________________________________
Nagios-users mailing list
Nagios-users@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nagios-users
::: Please include Nagios version, plugin version (-v) and OS when reporting 
any issue. 
::: Messages without supporting info will risk being sent to /dev/null

Reply via email to