Great to see progress in this area, I'm working on dmotion comeback ...
I'd go for (3) since it adds a real state, (1) is a hack and depends on
machine performance and load,
(2) is also not very beautiful ..
Tom
Am 05.12.2011 um 19:45 schrieb Sebastian Benoit:
> Hi,
>
> I have a relayd config with two tables <webhosts> and <web9k> containing the
> same hosts and with two redirects on different ports using these tables.
>
> ext_addr="10.12.33.59"
> webhost1="10.12.77.10"
> webhost2="10.12.77.11"
> #interval 2
> #timeout 500
> log all
> table <webhosts> { $webhost1 $webhost2 }
> table <web9k> { $webhost1 $webhost2 }
> redirect www {
> listen on $ext_addr port http
> match tag RELAYD
> forward to <webhosts> check http "/" code 200
> }
> redirect www9k {
> listen on $ext_addr port 9000
> match tag RELAYD
> forward to <web9k> check http "/" code 200
> }
>
> I see relayd crashes like this: (1)
>
> startup
> host 10.12.77.10, check http code (1ms), state unknown -> up, availability 100.00%
> host 10.12.77.11, check http code (2ms), state unknown -> up, availability 100.00%
> fatal: relay_dispatch_pfe: invalid host id
> pfe exiting, pid 30837
> hce exiting, pid 20192
> fatal: relay_dispatch_pfe: invalid host id
> relay exiting, pid 23812
> parent terminating, pid 14729
> relay exiting, pid 6320
> relay exiting, pid 5411
>
> or like this: (2)
>
> startup
> host 10.12.77.10, check http code (1ms), state unknown -> up, availability 100.00%
> host 10.12.77.11, check http code (2ms), state unknown -> up, availability 100.00%
> fatal: pfe_dispatch_hce: invalid host id
> hce exiting, pid 20299
> lost child: pfe exited abnormally
> relay exiting, pid 23329
> relay exiting, pid 13210
> relay exiting, pid 7180
> relay exiting, pid 23676
> relay exiting, pid 4130
> parent terminating, pid 23016
>
> An easy way to cause this is to have no target servers running at all
> (in this case I stopped the webservers), but it also happens when they
> are really fast. When they are a little slower it does not happen.
>
> There is a race between the hce and the other children (pfe and relays)
> in the window between loading the configuration and the start of
> processing IMSG_HOST_STATUS messages.
>
> The problem is that in hce_setup_events() the host checks are started
> before all children have all of the configuration.
>
> A quick hack is to insert a sleep(1) at the beginning of
> hce_setup_events().
>
> A fix might be to make 'invalid host id' non fatal:
>
> diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
> index 3830d33..281e8e4 100644
> --- a/usr.sbin/relayd/pfe.c
> +++ b/usr.sbin/relayd/pfe.c
> @@ -110,8 +110,10 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct
imsg *imsg)
> case IMSG_HOST_STATUS:
> IMSG_SIZE_CHECK(imsg, &st);
> memcpy(&st, imsg->data, sizeof(st));
> - if ((host = host_find(env, st.id)) == NULL)
> - fatalx("pfe_dispatch_hce: invalid host id");
> + if ((host = host_find(env, st.id)) == NULL) {
> + log_warnx("pfe_dispatch_hce: invalid host id");
> + break;
> + }
> host->he = st.he;
> if (host->flags & F_DISABLE)
> break;
> diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
> index 62ab44e..7c8494e 100644
> --- a/usr.sbin/relayd/relay.c
> +++ b/usr.sbin/relayd/relay.c
> @@ -2471,8 +2471,10 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p,
struct imsg *imsg)
> case IMSG_HOST_STATUS:
> IMSG_SIZE_CHECK(imsg, &st);
> memcpy(&st, imsg->data, sizeof(st));
> - if ((host = host_find(env, st.id)) == NULL)
> - fatalx("relay_dispatch_pfe: invalid host id");
> + if ((host = host_find(env, st.id)) == NULL) {
> + log_warnx("relay_dispatch_pfe: invalid host id");
> + break;
> + }
> if (host->flags & F_DISABLE)
> break;
> if (host->up == st.up) {
>
>
>
> Another might be to inhibit the processing of IMSG_HOST_STATUS only until
> the configuration has been completed (that is, after receiving
> IMSG_CFG_DONE):
>
>
>
> diff --git a/usr.sbin/relayd/config.c b/usr.sbin/relayd/config.c
> index ef185dc..8ade55f 100644
> --- a/usr.sbin/relayd/config.c
> +++ b/usr.sbin/relayd/config.c
> @@ -131,6 +131,8 @@ config_init(struct relayd *env)
> TAILQ_INIT(env->sc_routes);
> }
>
> + env->active = PROC_INACTIVE;
> +
> return (0);
> }
>
> diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
> index 3830d33..343a822 100644
> --- a/usr.sbin/relayd/pfe.c
> +++ b/usr.sbin/relayd/pfe.c
> @@ -110,8 +110,14 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct
imsg *imsg)
> case IMSG_HOST_STATUS:
> IMSG_SIZE_CHECK(imsg, &st);
> memcpy(&st, imsg->data, sizeof(st));
> - if ((host = host_find(env, st.id)) == NULL)
> + if ((host = host_find(env, st.id)) == NULL) {
> + if (env->active == PROC_INACTIVE) {
> + log_warnx("pfe_dispatch_hce: "
> + "invalid host id (not active)");
> + break;
> + }
> fatalx("pfe_dispatch_hce: invalid host id");
> + }
> host->he = st.he;
> if (host->flags & F_DISABLE)
> break;
> @@ -201,6 +207,7 @@ pfe_dispatch_parent(int fd, struct privsep_proc *p,
struct imsg *imsg)
> break;
> case IMSG_CFG_DONE:
> config_getcfg(env, imsg);
> + env->active = PROC_ACTIVE;
> init_filter(env, imsg->fd);
> init_tables(env);
> pfe_setup_events();
> diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
> index 62ab44e..1f114b5 100644
> --- a/usr.sbin/relayd/relay.c
> +++ b/usr.sbin/relayd/relay.c
> @@ -333,6 +333,8 @@ relay_init(struct privsep *ps, struct privsep_proc *p,
void *arg)
> if (config_init(ps->ps_env) == -1)
> fatal("failed to initialize configuration");
>
> + env->active = PROC_INACTIVE;
> +
> /* We use a custom shutdown callback */
> p->p_shutdown = relay_shutdown;
>
> @@ -2471,8 +2473,14 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p,
struct imsg *imsg)
> case IMSG_HOST_STATUS:
> IMSG_SIZE_CHECK(imsg, &st);
> memcpy(&st, imsg->data, sizeof(st));
> - if ((host = host_find(env, st.id)) == NULL)
> + if ((host = host_find(env, st.id)) == NULL) {
> + if (env->active == PROC_INACTIVE) {
> + log_warnx("relay_dispatch_pfe: "
> + "invalid host id (not active)");
> + break;
> + }
> fatalx("relay_dispatch_pfe: invalid host id");
> + }
> if (host->flags & F_DISABLE)
> break;
> if (host->up == st.up) {
> @@ -2577,6 +2585,7 @@ relay_dispatch_parent(int fd, struct privsep_proc *p,
struct imsg *imsg)
> break;
> case IMSG_CFG_DONE:
> config_getcfg(env, imsg);
> + env->active = PROC_ACTIVE;
> relay_launch();
> break;
> case IMSG_CTL_RESET:
> diff --git a/usr.sbin/relayd/relayd.h b/usr.sbin/relayd/relayd.h
> index 3d54045..3304c73 100644
> --- a/usr.sbin/relayd/relayd.h
> +++ b/usr.sbin/relayd/relayd.h
> @@ -869,8 +869,13 @@ struct relayd {
>
> struct privsep *sc_ps;
> int sc_reload;
> +
> + int active;
> };
>
> +#define PROC_INACTIVE 0x0
> +#define PROC_ACTIVE 0x1
> +
> #define RELAYD_OPT_VERBOSE 0x01
> #define RELAYD_OPT_NOACTION 0x04
> #define RELAYD_OPT_LOGUPDATE 0x08
>
>
> Any comments?
> /Benno