Hi,

I have a relayd config with two tables, <webhosts> and <web9k>, containing the
same hosts, and with two redirects on different ports using these tables.

  ext_addr="10.12.33.59"
  webhost1="10.12.77.10"
  webhost2="10.12.77.11"
  #interval 2
  #timeout 500
  log all
  table <webhosts> { $webhost1 $webhost2 }
  table <web9k> { $webhost1 $webhost2 }
  redirect www {
         listen on $ext_addr port http
         match tag RELAYD
         forward to <webhosts> check http "/" code 200
  }
  redirect www9k {
         listen on $ext_addr port 9000
         match tag RELAYD
         forward to <web9k> check http "/" code 200
  } 

I see relayd crashes like this: (1)

startup
host 10.12.77.10, check http code (1ms), state unknown -> up, availability 
100.00%
host 10.12.77.11, check http code (2ms), state unknown -> up, availability 
100.00%
fatal: relay_dispatch_pfe: invalid host id
pfe exiting, pid 30837
hce exiting, pid 20192
fatal: relay_dispatch_pfe: invalid host id
relay exiting, pid 23812
parent terminating, pid 14729
relay exiting, pid 6320
relay exiting, pid 5411

or like this: (2)

startup
host 10.12.77.10, check http code (1ms), state unknown -> up, availability 
100.00%
host 10.12.77.11, check http code (2ms), state unknown -> up, availability 
100.00%
fatal: pfe_dispatch_hce: invalid host id
hce exiting, pid 20299
lost child: pfe exited abnormally
relay exiting, pid 23329
relay exiting, pid 13210
relay exiting, pid 7180
relay exiting, pid 23676
relay exiting, pid 4130
parent terminating, pid 23016

An easy way to cause this is to have no target servers running at all
(in this case I stopped the webservers), but it also happens when they
are really fast. When they are a little slower it does not happen.

The hce races with the other child processes (pfe and relays) in the window
between loading the configuration and the start of processing
IMSG_HOST_STATUS messages.

The problem is that in hce_setup_events() the host checks are started before
all children have received all of the configuration.

A quick hack is to insert a sleep(1) at the beginning of hce_setup_events().

A fix might be to make 'invalid host id' non-fatal:

diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
index 3830d33..281e8e4 100644
--- a/usr.sbin/relayd/pfe.c
+++ b/usr.sbin/relayd/pfe.c
@@ -110,8 +110,10 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct 
imsg *imsg)
        case IMSG_HOST_STATUS:
                IMSG_SIZE_CHECK(imsg, &st);
                memcpy(&st, imsg->data, sizeof(st));
-               if ((host = host_find(env, st.id)) == NULL)
-                       fatalx("pfe_dispatch_hce: invalid host id");
+               if ((host = host_find(env, st.id)) == NULL) {
+                       log_warnx("pfe_dispatch_hce: invalid host id");
+                       break;
+               }
                host->he = st.he;
                if (host->flags & F_DISABLE)
                        break;
diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
index 62ab44e..7c8494e 100644
--- a/usr.sbin/relayd/relay.c
+++ b/usr.sbin/relayd/relay.c
@@ -2471,8 +2471,10 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p, 
struct imsg *imsg)
        case IMSG_HOST_STATUS:
                IMSG_SIZE_CHECK(imsg, &st);
                memcpy(&st, imsg->data, sizeof(st));
-               if ((host = host_find(env, st.id)) == NULL)
-                       fatalx("relay_dispatch_pfe: invalid host id");
+               if ((host = host_find(env, st.id)) == NULL) {
+                       log_warnx("relay_dispatch_pfe: invalid host id");
+                       break;
+               }
                if (host->flags & F_DISABLE)
                        break;
                if (host->up == st.up) {



Another might be to inhibit the processing of IMSG_HOST_STATUS only until
the configuration has been completed (that is, until IMSG_CFG_DONE has been
received):



diff --git a/usr.sbin/relayd/config.c b/usr.sbin/relayd/config.c
index ef185dc..8ade55f 100644
--- a/usr.sbin/relayd/config.c
+++ b/usr.sbin/relayd/config.c
@@ -131,6 +131,8 @@ config_init(struct relayd *env)
                TAILQ_INIT(env->sc_routes);
        }
 
+       env->active = PROC_INACTIVE;
+
        return (0);
 }
 
diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
index 3830d33..343a822 100644
--- a/usr.sbin/relayd/pfe.c
+++ b/usr.sbin/relayd/pfe.c
@@ -110,8 +110,14 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct 
imsg *imsg)
        case IMSG_HOST_STATUS:
                IMSG_SIZE_CHECK(imsg, &st);
                memcpy(&st, imsg->data, sizeof(st));
-               if ((host = host_find(env, st.id)) == NULL)
+               if ((host = host_find(env, st.id)) == NULL) {
+                       if (env->active == PROC_INACTIVE) {
+                               log_warnx("pfe_dispatch_hce: "
+                                   "invalid host id (not active)");
+                               break;
+                       }
                        fatalx("pfe_dispatch_hce: invalid host id");
+               }
                host->he = st.he;
                if (host->flags & F_DISABLE)
                        break;
@@ -201,6 +207,7 @@ pfe_dispatch_parent(int fd, struct privsep_proc *p, struct 
imsg *imsg)
                break;
        case IMSG_CFG_DONE:
                config_getcfg(env, imsg);
+               env->active = PROC_ACTIVE;
                init_filter(env, imsg->fd);
                init_tables(env);
                pfe_setup_events();
diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
index 62ab44e..1f114b5 100644
--- a/usr.sbin/relayd/relay.c
+++ b/usr.sbin/relayd/relay.c
@@ -333,6 +333,8 @@ relay_init(struct privsep *ps, struct privsep_proc *p, void 
*arg)
        if (config_init(ps->ps_env) == -1)
                fatal("failed to initialize configuration");
 
+       env->active = PROC_INACTIVE;
+
        /* We use a custom shutdown callback */
        p->p_shutdown = relay_shutdown;
 
@@ -2471,8 +2473,14 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p, 
struct imsg *imsg)
        case IMSG_HOST_STATUS:
                IMSG_SIZE_CHECK(imsg, &st);
                memcpy(&st, imsg->data, sizeof(st));
-               if ((host = host_find(env, st.id)) == NULL)
+               if ((host = host_find(env, st.id)) == NULL) {
+                       if (env->active == PROC_INACTIVE) {
+                               log_warnx("relay_dispatch_pfe: "
+                                   "invalid host id (not active)");
+                               break;
+                       }
                        fatalx("relay_dispatch_pfe: invalid host id");
+               }
                if (host->flags & F_DISABLE)
                        break;
                if (host->up == st.up) {
@@ -2577,6 +2585,7 @@ relay_dispatch_parent(int fd, struct privsep_proc *p, 
struct imsg *imsg)
                break;
        case IMSG_CFG_DONE:
                config_getcfg(env, imsg);
+               env->active = PROC_ACTIVE;
                relay_launch();
                break;
        case IMSG_CTL_RESET:
diff --git a/usr.sbin/relayd/relayd.h b/usr.sbin/relayd/relayd.h
index 3d54045..3304c73 100644
--- a/usr.sbin/relayd/relayd.h
+++ b/usr.sbin/relayd/relayd.h
@@ -869,8 +869,13 @@ struct relayd {
 
        struct privsep          *sc_ps;
        int                      sc_reload;
+
+       int                      active;
 };
 
+#define PROC_INACTIVE 0x0
+#define PROC_ACTIVE   0x1
+
 #define RELAYD_OPT_VERBOSE             0x01
 #define RELAYD_OPT_NOACTION            0x04
 #define RELAYD_OPT_LOGUPDATE           0x08


Any comments?
/Benno

Reply via email to