Hi,
I have a relayd config with two tables, <webhosts> and <web9k>, containing the
same hosts, and with two redirects on different ports using these tables.
ext_addr="10.12.33.59"
webhost1="10.12.77.10"
webhost2="10.12.77.11"
#interval 2
#timeout 500
log all
table <webhosts> { $webhost1 $webhost2 }
table <web9k> { $webhost1 $webhost2 }
redirect www {
listen on $ext_addr port http
match tag RELAYD
forward to <webhosts> check http "/" code 200
}
redirect www9k {
listen on $ext_addr port 9000
match tag RELAYD
forward to <web9k> check http "/" code 200
}
I see relayd crashes like this: (1)
startup
host 10.12.77.10, check http code (1ms), state unknown -> up, availability
100.00%
host 10.12.77.11, check http code (2ms), state unknown -> up, availability
100.00%
fatal: relay_dispatch_pfe: invalid host id
pfe exiting, pid 30837
hce exiting, pid 20192
fatal: relay_dispatch_pfe: invalid host id
relay exiting, pid 23812
parent terminating, pid 14729
relay exiting, pid 6320
relay exiting, pid 5411
or like this: (2)
startup
host 10.12.77.10, check http code (1ms), state unknown -> up, availability
100.00%
host 10.12.77.11, check http code (2ms), state unknown -> up, availability
100.00%
fatal: pfe_dispatch_hce: invalid host id
hce exiting, pid 20299
lost child: pfe exited abnormally
relay exiting, pid 23329
relay exiting, pid 13210
relay exiting, pid 7180
relay exiting, pid 23676
relay exiting, pid 4130
parent terminating, pid 23016
An easy way to trigger this is to have no target servers running at all
(in this case I stopped the web servers), but it also happens when they
are really fast. When they are a little slower it does not happen.
The hce races with the other children (pfe and relays) in the window
between loading the configuration and the start of processing
IMSG_HOST_STATUS messages.
The problem is that in hce_setup_events() the host checks are started before
all children have received all of the configuration.
A quick hack is to insert a sleep(1) at the beginning of hce_setup_events().
A fix might be to make 'invalid host id' non-fatal:
diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
index 3830d33..281e8e4 100644
--- a/usr.sbin/relayd/pfe.c
+++ b/usr.sbin/relayd/pfe.c
@@ -110,8 +110,10 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct
imsg *imsg)
case IMSG_HOST_STATUS:
IMSG_SIZE_CHECK(imsg, &st);
memcpy(&st, imsg->data, sizeof(st));
- if ((host = host_find(env, st.id)) == NULL)
- fatalx("pfe_dispatch_hce: invalid host id");
+ if ((host = host_find(env, st.id)) == NULL) {
+ log_warnx("pfe_dispatch_hce: invalid host id");
+ break;
+ }
host->he = st.he;
if (host->flags & F_DISABLE)
break;
diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
index 62ab44e..7c8494e 100644
--- a/usr.sbin/relayd/relay.c
+++ b/usr.sbin/relayd/relay.c
@@ -2471,8 +2471,10 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p,
struct imsg *imsg)
case IMSG_HOST_STATUS:
IMSG_SIZE_CHECK(imsg, &st);
memcpy(&st, imsg->data, sizeof(st));
- if ((host = host_find(env, st.id)) == NULL)
- fatalx("relay_dispatch_pfe: invalid host id");
+ if ((host = host_find(env, st.id)) == NULL) {
+ log_warnx("relay_dispatch_pfe: invalid host id");
+ break;
+ }
if (host->flags & F_DISABLE)
break;
if (host->up == st.up) {
Another might be to inhibit the processing of IMSG_HOST_STATUS only until
the configuration has been completed (that is after receiving IMSG_CFG_DONE):
diff --git a/usr.sbin/relayd/config.c b/usr.sbin/relayd/config.c
index ef185dc..8ade55f 100644
--- a/usr.sbin/relayd/config.c
+++ b/usr.sbin/relayd/config.c
@@ -131,6 +131,8 @@ config_init(struct relayd *env)
TAILQ_INIT(env->sc_routes);
}
+ env->active = PROC_INACTIVE;
+
return (0);
}
diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
index 3830d33..343a822 100644
--- a/usr.sbin/relayd/pfe.c
+++ b/usr.sbin/relayd/pfe.c
@@ -110,8 +110,14 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct
imsg *imsg)
case IMSG_HOST_STATUS:
IMSG_SIZE_CHECK(imsg, &st);
memcpy(&st, imsg->data, sizeof(st));
- if ((host = host_find(env, st.id)) == NULL)
+ if ((host = host_find(env, st.id)) == NULL) {
+ if (env->active == PROC_INACTIVE) {
+ log_warnx("pfe_dispatch_hce: "
+ "invalid host id (not active)");
+ break;
+ }
fatalx("pfe_dispatch_hce: invalid host id");
+ }
host->he = st.he;
if (host->flags & F_DISABLE)
break;
@@ -201,6 +207,7 @@ pfe_dispatch_parent(int fd, struct privsep_proc *p, struct
imsg *imsg)
break;
case IMSG_CFG_DONE:
config_getcfg(env, imsg);
+ env->active = PROC_ACTIVE;
init_filter(env, imsg->fd);
init_tables(env);
pfe_setup_events();
diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
index 62ab44e..1f114b5 100644
--- a/usr.sbin/relayd/relay.c
+++ b/usr.sbin/relayd/relay.c
@@ -333,6 +333,8 @@ relay_init(struct privsep *ps, struct privsep_proc *p, void
*arg)
if (config_init(ps->ps_env) == -1)
fatal("failed to initialize configuration");
+ env->active = PROC_INACTIVE;
+
/* We use a custom shutdown callback */
p->p_shutdown = relay_shutdown;
@@ -2471,8 +2473,14 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p,
struct imsg *imsg)
case IMSG_HOST_STATUS:
IMSG_SIZE_CHECK(imsg, &st);
memcpy(&st, imsg->data, sizeof(st));
- if ((host = host_find(env, st.id)) == NULL)
+ if ((host = host_find(env, st.id)) == NULL) {
+ if (env->active == PROC_INACTIVE) {
+ log_warnx("relay_dispatch_pfe: "
+ "invalid host id (not active)");
+ break;
+ }
fatalx("relay_dispatch_pfe: invalid host id");
+ }
if (host->flags & F_DISABLE)
break;
if (host->up == st.up) {
@@ -2577,6 +2585,7 @@ relay_dispatch_parent(int fd, struct privsep_proc *p,
struct imsg *imsg)
break;
case IMSG_CFG_DONE:
config_getcfg(env, imsg);
+ env->active = PROC_ACTIVE;
relay_launch();
break;
case IMSG_CTL_RESET:
diff --git a/usr.sbin/relayd/relayd.h b/usr.sbin/relayd/relayd.h
index 3d54045..3304c73 100644
--- a/usr.sbin/relayd/relayd.h
+++ b/usr.sbin/relayd/relayd.h
@@ -869,8 +869,13 @@ struct relayd {
struct privsep *sc_ps;
int sc_reload;
+
+ int active;
};
+#define PROC_INACTIVE 0x0
+#define PROC_ACTIVE 0x1
+
#define RELAYD_OPT_VERBOSE 0x01
#define RELAYD_OPT_NOACTION 0x04
#define RELAYD_OPT_LOGUPDATE 0x08
Any comments?
/Benno