Great to see progress in this area, I'm working on dmotion comeback ...

I'd go for (3) since it adds a real state; (1) is a hack and depends on
machine performance and load, and (2) is also not very beautiful.

Tom


Am 05.12.2011 um 19:45 schrieb Sebastian Benoit:

> Hi,
>
> I have a relayd config with two tables <webhosts> and <web9k> containing the
> same hosts and with two redirects on different ports using these tables.
>
>  ext_addr="10.12.33.59"
>  webhost1="10.12.77.10"
>  webhost2="10.12.77.11"
>  #interval 2
>  #timeout 500
>  log all
>  table <webhosts> { $webhost1 $webhost2 }
>  table <web9k> { $webhost1 $webhost2 }
>  redirect www {
>         listen on $ext_addr port http
>         match tag RELAYD
>         forward to <webhosts> check http "/" code 200
>  }
>  redirect www9k {
>         listen on $ext_addr port 9000
>         match tag RELAYD
>         forward to <web9k> check http "/" code 200
>  }
>
> I see relayd crashes like this: (1)
>
> startup
> host 10.12.77.10, check http code (1ms), state unknown -> up, availability 100.00%
> host 10.12.77.11, check http code (2ms), state unknown -> up, availability 100.00%
> fatal: relay_dispatch_pfe: invalid host id
> pfe exiting, pid 30837
> hce exiting, pid 20192
> fatal: relay_dispatch_pfe: invalid host id
> relay exiting, pid 23812
> parent terminating, pid 14729
> relay exiting, pid 6320
> relay exiting, pid 5411
>
> or like this: (2)
>
> startup
> host 10.12.77.10, check http code (1ms), state unknown -> up, availability 100.00%
> host 10.12.77.11, check http code (2ms), state unknown -> up, availability 100.00%
> fatal: pfe_dispatch_hce: invalid host id
> hce exiting, pid 20299
> lost child: pfe exited abnormally
> relay exiting, pid 23329
> relay exiting, pid 13210
> relay exiting, pid 7180
> relay exiting, pid 23676
> relay exiting, pid 4130
> parent terminating, pid 23016
>
> An easy way to cause this is to have no target servers running at all
> (in this case i stopped the webservers), but it also happens when they
> are really fast. When they are a little slower it does not happen.
>
> There is a race of the hce and the other childs (pfe and relays)
> between loading the configuration and start of processing IMSG_HOST_STATUS
> messages.
>
> The problem is that in hce_setup_events() the host checks are started
> before all childs have all of the configuration.
>
> A quick hack is to insert a sleep(1) at the beginning of
> hce_setup_events().
>
> A fix might be to make 'invalid host id' non fatal:
>
> diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
> index 3830d33..281e8e4 100644
> --- a/usr.sbin/relayd/pfe.c
> +++ b/usr.sbin/relayd/pfe.c
> @@ -110,8 +110,10 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct imsg *imsg)
>       case IMSG_HOST_STATUS:
>               IMSG_SIZE_CHECK(imsg, &st);
>               memcpy(&st, imsg->data, sizeof(st));
> -             if ((host = host_find(env, st.id)) == NULL)
> -                     fatalx("pfe_dispatch_hce: invalid host id");
> +             if ((host = host_find(env, st.id)) == NULL) {
> +                     log_warnx("pfe_dispatch_hce: invalid host id");
> +                     break;
> +             }
>               host->he = st.he;
>               if (host->flags & F_DISABLE)
>                       break;
> diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
> index 62ab44e..7c8494e 100644
> --- a/usr.sbin/relayd/relay.c
> +++ b/usr.sbin/relayd/relay.c
> @@ -2471,8 +2471,10 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p, struct imsg *imsg)
>       case IMSG_HOST_STATUS:
>               IMSG_SIZE_CHECK(imsg, &st);
>               memcpy(&st, imsg->data, sizeof(st));
> -             if ((host = host_find(env, st.id)) == NULL)
> -                     fatalx("relay_dispatch_pfe: invalid host id");
> +             if ((host = host_find(env, st.id)) == NULL) {
> +                     log_warnx("relay_dispatch_pfe: invalid host id");
> +                     break;
> +             }
>               if (host->flags & F_DISABLE)
>                       break;
>               if (host->up == st.up) {
>
>
>
> Another might be to inhibit the processing of IMSG_HOST_STATUS only until
> the configuration has been completed (that is after receiving
> IMSG_CFG_DONE):
>
>
>
> diff --git a/usr.sbin/relayd/config.c b/usr.sbin/relayd/config.c
> index ef185dc..8ade55f 100644
> --- a/usr.sbin/relayd/config.c
> +++ b/usr.sbin/relayd/config.c
> @@ -131,6 +131,8 @@ config_init(struct relayd *env)
>               TAILQ_INIT(env->sc_routes);
>       }
>
> +     env->active = PROC_INACTIVE;
> +
>       return (0);
> }
>
> diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
> index 3830d33..343a822 100644
> --- a/usr.sbin/relayd/pfe.c
> +++ b/usr.sbin/relayd/pfe.c
> @@ -110,8 +110,14 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct imsg *imsg)
>       case IMSG_HOST_STATUS:
>               IMSG_SIZE_CHECK(imsg, &st);
>               memcpy(&st, imsg->data, sizeof(st));
> -             if ((host = host_find(env, st.id)) == NULL)
> +             if ((host = host_find(env, st.id)) == NULL) {
> +                     if (env->active == PROC_INACTIVE) {
> +                             log_warnx("pfe_dispatch_hce: "
> +                                 "invalid host id (not active)");
> +                             break;
> +                     }
>                       fatalx("pfe_dispatch_hce: invalid host id");
> +             }
>               host->he = st.he;
>               if (host->flags & F_DISABLE)
>                       break;
> @@ -201,6 +207,7 @@ pfe_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
>               break;
>       case IMSG_CFG_DONE:
>               config_getcfg(env, imsg);
> +             env->active = PROC_ACTIVE;
>               init_filter(env, imsg->fd);
>               init_tables(env);
>               pfe_setup_events();
> diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
> index 62ab44e..1f114b5 100644
> --- a/usr.sbin/relayd/relay.c
> +++ b/usr.sbin/relayd/relay.c
> @@ -333,6 +333,8 @@ relay_init(struct privsep *ps, struct privsep_proc *p, void *arg)
>       if (config_init(ps->ps_env) == -1)
>               fatal("failed to initialize configuration");
>
> +     env->active = PROC_INACTIVE;
> +
>       /* We use a custom shutdown callback */
>       p->p_shutdown = relay_shutdown;
>
> @@ -2471,8 +2473,14 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p, struct imsg *imsg)
>       case IMSG_HOST_STATUS:
>               IMSG_SIZE_CHECK(imsg, &st);
>               memcpy(&st, imsg->data, sizeof(st));
> -             if ((host = host_find(env, st.id)) == NULL)
> +             if ((host = host_find(env, st.id)) == NULL) {
> +                     if (env->active == PROC_INACTIVE) {
> +                             log_warnx("relay_dispatch_pfe: "
> +                                 "invalid host id (not active)");
> +                             break;
> +                     }
>                       fatalx("relay_dispatch_pfe: invalid host id");
> +             }
>               if (host->flags & F_DISABLE)
>                       break;
>               if (host->up == st.up) {
> @@ -2577,6 +2585,7 @@ relay_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
>               break;
>       case IMSG_CFG_DONE:
>               config_getcfg(env, imsg);
> +             env->active = PROC_ACTIVE;
>               relay_launch();
>               break;
>       case IMSG_CTL_RESET:
> diff --git a/usr.sbin/relayd/relayd.h b/usr.sbin/relayd/relayd.h
> index 3d54045..3304c73 100644
> --- a/usr.sbin/relayd/relayd.h
> +++ b/usr.sbin/relayd/relayd.h
> @@ -869,8 +869,13 @@ struct relayd {
>
>       struct privsep          *sc_ps;
>       int                      sc_reload;
> +
> +     int                      active;
> };
>
> +#define PROC_INACTIVE 0x0
> +#define PROC_ACTIVE   0x1
> +
> #define RELAYD_OPT_VERBOSE            0x01
> #define RELAYD_OPT_NOACTION           0x04
> #define RELAYD_OPT_LOGUPDATE          0x08
>
>
> Any comments?
> /Benno

Reply via email to