>Number:         6627
>Category:       system
>Synopsis:       relayd - desynchronization on host change during a running 
>check
>Confidential:   yes
>Severity:       serious
>Priority:       medium
>Responsible:    bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   unknown
>Arrival-Date:   Mon Jun 06 14:30:01 GMT 2011
>Closed-Date:
>Last-Modified:
>Originator:     
>Release:        
>Organization:
>Environment:
        System      : OpenBSD 4.9
        Machine     : amd64
>Description:
        If you disable a host during a running check and enable later,
        relayd desynchronizes. Applies probably to all relayd versions.
        Suggested fix approach attached.
>How-To-Repeat:
        Disable a host during a running check of this host - this changes the
        host state. The check result is processed - this changes host state 
again.
        Then enable the host after the check result is processed - relayd
        desynchronizes on next state change.

Alternative 1:
--------------
/etc/relayd.conf:
interval 30
timeout 15000

localhost="127.0.0.1"
# non-existing remote IP
remote="10.0.3.101"
table <host_table> { $remote }

# Problem also with "redirect redirect1 {"
relay relay1 {
        listen on $localhost port 8000
        forward to <host_table> port http check tcp
}

Alternative 2:
--------------
/etc/relayd.conf:
interval 30
timeout 15000

localhost="127.0.0.1"
table <host_table> { $localhost }

relay relay1 {
        listen on $localhost port 8000
        forward to <host_table> port http check script "/tmp/check.sh"
}

/tmp/check.sh
#!/bin/sh
sleep 5
exit 1

Alternative 1 relayd debug output:
----------------------------------
# relayd -d -vv          
startup
config_settable: sending table host_table:80 1 to hce
proc_run: pfe 1/1, pid 1744
config_settable: sending table host_table:80 1 to relay
proc_run: hce 1/1, pid 20454
proc_run: relay 1/5, pid 25090
config_settable: sending table host_table:80 1 to pfe
socket_rlimit: max open files 1024
proc_run: relay 2/5, pid 20626
proc_run: relay 3/5, pid 6630
proc_run: relay 4/5, pid 19157
proc_run: relay 5/5, pid 17023
socket_rlimit: max open files 1024
relay_privinit: adding relay relay1
socket_rlimit: max open files 1024
socket_rlimit: max open files 1024
socket_rlimit: max open files 1024
socket_rlimit: max open files 1024
protocol -1: name default
        flags: used, relay flags: 
        type: tcp
config_setrelay: sending relay relay1 to relay fd 4
config_setrelay: sending relay relay1 to pfe fd 4
proc_dispatch: hce 1 got imsg 44 from parent 0
proc_dispatch: relay 1 got imsg 44 from parent 0
config_gettable: hce 0 received table 1 (host_table:80)
proc_dispatch: relay 2 got imsg 44 from parent 1
config_gettable: relay 0 received table 1 (host_table:80)
proc_dispatch: hce 1 got imsg 45 from parent 0
proc_dispatch: relay 3 got imsg 44 from parent 2
config_gettable: relay 1 received table 1 (host_table:80)
proc_dispatch: relay 1 got imsg 45 from parent 0
config_gethost: hce 0 received host 10.0.3.101 for table host_table:80
proc_dispatch: relay 4 got imsg 44 from parent 3
config_gettable: relay 2 received table 1 (host_table:80)
proc_dispatch: relay 2 got imsg 45 from parent 1
config_gethost: relay 0 received host 10.0.3.101 for table host_table:80
proc_dispatch: hce 1 got imsg 53 from parent 0
proc_dispatch: relay 5 got imsg 44 from parent 4
config_gettable: relay 3 received table 1 (host_table:80)
proc_dispatch: relay 3 got imsg 45 from parent 2
config_gethost: relay 1 received host 10.0.3.101 for table host_table:80
proc_dispatch: relay 1 got imsg 52 from parent 0
proc_dispatch: pfe 1 got imsg 36 from hce 0
proc_dispatch: parent 1 got imsg 53 from hce 0
config_gettable: relay 4 received table 1 (host_table:80)
proc_dispatch: relay 4 got imsg 45 from parent 3
config_gethost: relay 2 received host 10.0.3.101 for table host_table:80
proc_dispatch: relay 2 got imsg 52 from parent 1
config_getrelay: relay 0 received relay relay1
proc_dispatch: pfe 1 got imsg 44 from parent 0
proc_dispatch: relay 5 got imsg 45 from parent 4
config_gethost: relay 3 received host 10.0.3.101 for table host_table:80
proc_dispatch: relay 3 got imsg 52 from parent 2
config_getrelay: relay 1 received relay relay1
proc_dispatch: relay 1 got imsg 53 from parent 0
config_gettable: pfe 0 received table 1 (host_table:80)
config_gethost: relay 4 received host 10.0.3.101 for table host_table:80
proc_dispatch: relay 4 got imsg 52 from parent 3
config_getrelay: relay 2 received relay relay1
proc_dispatch: relay 2 got imsg 53 from parent 1
adding 1 hosts from table host_table:80
proc_dispatch: pfe 1 got imsg 45 from parent 0
proc_dispatch: relay 5 got imsg 52 from parent 4
config_getrelay: relay 3 received relay relay1
proc_dispatch: relay 3 got imsg 53 from parent 2
adding 1 hosts from table host_table:80
relay_launch: running relay relay1
config_gethost: pfe 0 received host 10.0.3.101 for table host_table:80
config_getrelay: relay 4 received relay relay1
proc_dispatch: relay 4 got imsg 53 from parent 3
adding 1 hosts from table host_table:80
relay_launch: running relay relay1
proc_dispatch: parent 1 got imsg 53 from relay 4
proc_dispatch: pfe 1 got imsg 52 from parent 0
proc_dispatch: relay 5 got imsg 53 from parent 4
adding 1 hosts from table host_table:80
relay_launch: running relay relay1
proc_dispatch: parent 1 got imsg 53 from relay 4
config_getrelay: pfe 0 received relay relay1
adding 1 hosts from table host_table:80
relay_launch: running relay relay1
proc_dispatch: parent 1 got imsg 53 from relay 4
proc_dispatch: pfe 1 got imsg 53 from parent 0
relay_launch: running relay relay1
proc_dispatch: parent 1 got imsg 53 from relay 4
proc_dispatch: parent 1 got imsg 53 from pfe 0
proc_dispatch: parent 1 got imsg 53 from relay 4
disable_host: host 1
proc_dispatch: hce 1 got imsg 34 from pfe 0
proc_dispatch: relay 1 got imsg 34 from pfe 0
proc_dispatch: relay 2 got imsg 34 from pfe 1
proc_dispatch: relay 3 got imsg 34 from pfe 2
proc_dispatch: relay 4 got imsg 34 from pfe 3
proc_dispatch: relay 5 got imsg 34 from pfe 4
hce_notify_done: 10.0.3.101 (tcp connect timeout)
host 10.0.3.101, check tcp (15013ms), state unknown -> down, availability 0.00%
proc_dispatch: pfe 1 got imsg 35 from hce 0
enable_host: host 1
proc_dispatch: hce 1 got imsg 33 from pfe 0
proc_dispatch: relay 1 got imsg 33 from pfe 0
proc_dispatch: relay 2 got imsg 33 from pfe 1
proc_dispatch: relay 3 got imsg 33 from pfe 2
proc_dispatch: relay 4 got imsg 33 from pfe 3
proc_dispatch: relay 5 got imsg 33 from pfe 4
proc_dispatch: pfe 1 got imsg 36 from hce 0
hce_notify_done: 10.0.3.101 (tcp connect timeout)
host 10.0.3.101, check tcp (15010ms), state unknown -> down, availability 0.00%
proc_dispatch: pfe 1 got imsg 35 from hce 0
pfe_dispatch_hce: host 1 => 0
fatal: pfe_dispatch_imsg: desynchronized
hce exiting, pid 20454
lost child: pfe exited abnormally
relay exiting, pid 25090
relay exiting, pid 20626
relay exiting, pid 6630
relay exiting, pid 19157
relay exiting, pid 17023
parent terminating, pid 6270

Alternative 2 relayd debug output
---------------------------------
# relayd -d -vv 
startup
config_settable: sending table host_table:80 1 to hce
proc_run: pfe 1/1, pid 31295
config_settable: sending table host_table:80 1 to relay
proc_run: hce 1/1, pid 17714
proc_run: relay 1/5, pid 855
config_settable: sending table host_table:80 1 to pfe
socket_rlimit: max open files 1024
proc_run: relay 2/5, pid 22109
proc_run: relay 3/5, pid 4931
proc_run: relay 4/5, pid 6864
proc_run: relay 5/5, pid 5936
socket_rlimit: max open files 1024
relay_privinit: adding relay relay1
socket_rlimit: max open files 1024
socket_rlimit: max open files 1024
socket_rlimit: max open files 1024
socket_rlimit: max open files 1024
protocol -1: name default
        flags: used, relay flags: 
        type: tcp
config_setrelay: sending relay relay1 to relay fd 4
config_setrelay: sending relay relay1 to pfe fd 4
proc_dispatch: hce 1 got imsg 44 from parent 0
proc_dispatch: relay 1 got imsg 44 from parent 0
config_gettable: hce 0 received table 1 (host_table:80)
proc_dispatch: relay 2 got imsg 44 from parent 1
config_gettable: relay 0 received table 1 (host_table:80)
proc_dispatch: hce 1 got imsg 45 from parent 0
proc_dispatch: relay 3 got imsg 44 from parent 2
config_gettable: relay 1 received table 1 (host_table:80)
proc_dispatch: relay 1 got imsg 45 from parent 0
config_gethost: hce 0 received host 127.0.0.1 for table host_table:80
proc_dispatch: relay 4 got imsg 44 from parent 3
config_gettable: relay 2 received table 1 (host_table:80)
proc_dispatch: relay 2 got imsg 45 from parent 1
config_gethost: relay 0 received host 127.0.0.1 for table host_table:80
proc_dispatch: hce 1 got imsg 53 from parent 0
proc_dispatch: relay 5 got imsg 44 from parent 4
config_gettable: relay 3 received table 1 (host_table:80)
proc_dispatch: relay 3 got imsg 45 from parent 2
config_gethost: relay 1 received host 127.0.0.1 for table host_table:80
proc_dispatch: relay 1 got imsg 52 from parent 0
proc_dispatch: pfe 1 got imsg 36 from hce 0
proc_dispatch: parent 1 got imsg 53 from hce 0
config_gettable: relay 4 received table 1 (host_table:80)
proc_dispatch: relay 4 got imsg 45 from parent 3
config_gethost: relay 2 received host 127.0.0.1 for table host_table:80
proc_dispatch: relay 2 got imsg 52 from parent 1
config_getrelay: relay 0 received relay relay1
proc_dispatch: pfe 1 got imsg 44 from parent 0
proc_dispatch: parent 1 got imsg 40 from hce 0
proc_dispatch: relay 5 got imsg 45 from parent 4
config_gethost: relay 3 received host 127.0.0.1 for table host_table:80
proc_dispatch: relay 3 got imsg 52 from parent 2
config_getrelay: relay 1 received relay relay1
proc_dispatch: relay 1 got imsg 53 from parent 0
config_gettable: pfe 0 received table 1 (host_table:80)
script_exec: running script /tmp/chk_test.sh, host 127.0.0.1
config_gethost: relay 4 received host 127.0.0.1 for table host_table:80
proc_dispatch: relay 4 got imsg 52 from parent 3
config_getrelay: relay 2 received relay relay1
proc_dispatch: relay 2 got imsg 53 from parent 1
adding 1 hosts from table host_table:80
proc_dispatch: pfe 1 got imsg 45 from parent 0
proc_dispatch: relay 5 got imsg 52 from parent 4
config_getrelay: relay 3 received relay relay1
proc_dispatch: relay 3 got imsg 53 from parent 2
adding 1 hosts from table host_table:80
relay_launch: running relay relay1
config_gethost: pfe 0 received host 127.0.0.1 for table host_table:80
config_getrelay: relay 4 received relay relay1
proc_dispatch: relay 4 got imsg 53 from parent 3
adding 1 hosts from table host_table:80
relay_launch: running relay relay1
proc_dispatch: pfe 1 got imsg 52 from parent 0
proc_dispatch: relay 5 got imsg 53 from parent 4
adding 1 hosts from table host_table:80
relay_launch: running relay relay1
config_getrelay: pfe 0 received relay relay1
adding 1 hosts from table host_table:80
relay_launch: running relay relay1
proc_dispatch: pfe 1 got imsg 53 from parent 0
relay_launch: running relay relay1
disable_host: host 1
proc_dispatch: hce 1 got imsg 34 from pfe 0
proc_dispatch: relay 1 got imsg 34 from pfe 0
proc_dispatch: relay 2 got imsg 34 from pfe 1
proc_dispatch: relay 3 got imsg 34 from pfe 2
proc_dispatch: relay 4 got imsg 34 from pfe 3
proc_dispatch: relay 5 got imsg 34 from pfe 4
proc_dispatch: parent 1 got imsg 53 from relay 4
proc_dispatch: parent 1 got imsg 53 from relay 4
proc_dispatch: parent 1 got imsg 53 from relay 4
proc_dispatch: parent 1 got imsg 53 from relay 4
proc_dispatch: parent 1 got imsg 53 from pfe 0
proc_dispatch: parent 1 got imsg 53 from relay 4
proc_dispatch: hce 1 got imsg 40 from parent 0
hce_notify_done: 127.0.0.1 (script ok)
host 127.0.0.1, check script (5014ms), state unknown -> up, availability 100.00%
proc_dispatch: pfe 1 got imsg 35 from hce 0
enable_host: host 1
proc_dispatch: hce 1 got imsg 33 from pfe 0
proc_dispatch: relay 1 got imsg 33 from pfe 0
proc_dispatch: relay 2 got imsg 33 from pfe 1
proc_dispatch: relay 3 got imsg 33 from pfe 2
proc_dispatch: relay 4 got imsg 33 from pfe 3
proc_dispatch: relay 5 got imsg 33 from pfe 4
proc_dispatch: pfe 1 got imsg 36 from hce 0
proc_dispatch: parent 1 got imsg 40 from hce 0
script_exec: running script /tmp/chk_test.sh, host 127.0.0.1
proc_dispatch: hce 1 got imsg 40 from parent 0
hce_notify_done: 127.0.0.1 (script ok)
host 127.0.0.1, check script (5012ms), state unknown -> up, availability 100.00%
proc_dispatch: pfe 1 got imsg 35 from hce 0
pfe_dispatch_hce: host 1 => 0
fatal: pfe_dispatch_imsg: desynchronized
hce exiting, pid 17714
lost child: pfe exited abnormally
relay exiting, pid 855
relay exiting, pid 22109
relay exiting, pid 4931
relay exiting, pid 6864
relay exiting, pid 5936
parent terminating, pid 25099
>Fix:
        Here is a possible approach of fixing the problem - hce_notify_done
        should not process the check result of a test if its host was
        disabled during the check. The passed cte->host doesn't know that
        the host has been disabled yet so it has to be verified against env.

--- src/usr.sbin/relayd/hce.c.orig      Mon Jun  6 18:02:45 2011
+++ src/usr.sbin/relayd/hce.c   Mon Jun  6 18:09:07 2011
@@ -207,10 +207,27 @@
        struct timeval           tv_now, tv_dur;
        u_long                   duration;
        u_int                    logopt;
-       struct host             *h;
+       struct host             *h, *hostupd;
        int                      hostup;
        const char              *msg;
 
+       if ((hostupd = host_find(env, host->conf.id)) == NULL)
+               fatalx("hce_notify_done: desynchronized");
+
+       if ((table = table_find(env, host->conf.tableid)) == NULL)
+               fatalx("hce_notify_done: invalid table id");
+
+       if (hostupd->flags & F_DISABLE) {
+               if (env->sc_opts & RELAYD_OPT_LOGUPDATE) {
+                       log_info("host %s, check %s%s (ignoring result, "
+                           "host disabled)",
+                           host->conf.name, table_check(table->conf.check),
+                           (table->conf.flags & F_SSL) ? " use ssl" : "");
+               }
+               host->flags |= (F_CHECK_SENT|F_CHECK_DONE);
+               return;
+       }
+
        hostup = host->up;
        host->he = he;
 
@@ -250,9 +267,6 @@
                duration = (tv_dur.tv_sec * 1000) + (tv_dur.tv_usec / 1000.0);
        else
                duration = 0;
-
-       if ((table = table_find(env, host->conf.tableid)) == NULL)
-               fatalx("hce_notify_done: invalid table id");
 
        if (env->sc_opts & logopt) {
                log_info("host %s, check %s%s (%lums), state %s -> %s, "


>Release-Note:
>Audit-Trail:
>Unformatted:

Reply via email to