On Fri, May 22, 2015 at 09:18:29PM +0200, Alexander Bluhm wrote:
> This breaks relayd with TLS inspection. Moving down the "called
> once" check after the F_TLSINSPECT block fixes the plain SSL case.
> But HTTPS still hangs. I have just commited a test. Try
>
> cd /usr/src/regress/usr.sbin/relayd && make run-regress-args-https-inspect.pl
>
> bluhm
>
> On Fri, May 22, 2015 at 03:55:16PM +0200, Claudio Jeker wrote:
> > On our production systems we did hit the "relay_connect: no connection in
> > flight" on a so regular bases that I had to make it non-fatal with the
> > result of leaking sockets.
> >
> > After more investigation I found the problem to be a race against
> > connecting to the backend servers. In short:
> > - relay_read_http() will open a connection if following conditions are met
> > cre->dir == RELAY_DIR_REQUEST && cre->toread <= 0 && cre->dst->bev == NULL
> >
> > - relay_connect() does not initialize con->se_out.bev (which is also
> > cre->dst->bev). Instead this is deferred to relay_connected()
> >
> > - if a event happens that calls relay_read_http() while connecting to the
> > backend then relay_connect() will be called again. Result is the panic
> > since the count gets out of sync.
> >
> > The following diff solves this issue by adding an extra flag to
> > ctl_relay_event to know if a relay is already connected (or the connect is
> > pending). relay_close() will then clean the flag when closing the session.
> > I decided to use a flag since the EMFILE || ENFILE case is hard to detect
> > otherwise.
> >
> > Running with this on production with no visible issues at the moment.
> > I think it would make sense to restructure the http proxy code more and
> > introduce a proper state machine but that is a much bigger and complex
> > issue, so lets fix the bug first.
> >
New version that now passes all regress tests. It got a fair bit more
complex because the F_TLSINSPECT case results in multiple entries of
relay_connect and relay_connected. So I switched to use a state variable
instead of a flag. It seems to work (and also works in the EMFILE/ENFILE
case). At least it "worked" for me by forcing that code path all the time.
--
:wq Claudio
Index: relay.c
===================================================================
RCS file: /cvs/src/usr.sbin/relayd/relay.c,v
retrieving revision 1.194
diff -u -p -r1.194 relay.c
--- relay.c 18 May 2015 16:57:20 -0000 1.194
+++ relay.c 30 May 2015 11:12:27 -0000
@@ -1408,8 +1408,10 @@ relay_connect_retry(int fd, short sig, v
struct relay *rlay = con->se_relay;
int bnds = -1;
- if (relay_inflight < 1)
- fatalx("relay_connect_retry: no connection in flight");
+ if (relay_inflight < 1) {
+ log_warnx("relay_connect_retry: no connection in flight");
+ relay_inflight = 1;
+ }
DPRINTF("%s: retry %d of %d, inflight: %d",__func__,
con->se_retrycount, con->se_retry, relay_inflight);
@@ -1466,6 +1468,10 @@ relay_connect_retry(int fd, short sig, v
return;
}
+ if (rlay->rl_conf.flags & F_TLSINSPECT)
+ con->se_out.state = PRECONNECT;
+ else
+ con->se_out.state = CONNECTED;
relay_inflight--;
DPRINTF("%s: inflight decremented, now %d",__func__, relay_inflight);
@@ -1484,9 +1490,14 @@ relay_connect_retry(int fd, short sig, v
int
relay_preconnect(struct rsession *con)
{
+ int rv;
+
log_debug("%s: session %d: process %d", __func__,
con->se_id, privsep_process);
- return (relay_connect(con));
+ rv = relay_connect(con);
+ if (con->se_out.state == CONNECTED)
+ con->se_out.state = PRECONNECT;
+ return (rv);
}
int
@@ -1496,18 +1507,28 @@ relay_connect(struct rsession *con)
struct timeval evtpause = { 1, 0 };
int bnds = -1, ret;
+ /* relay_connect should only be called once per relay */
+ if (con->se_out.state == CONNECTED) {
+ log_debug("%s: connect already called once", __func__);
+ return (0);
+ }
+
/* Connection is already established but session not active */
- if ((rlay->rl_conf.flags & F_TLSINSPECT) && con->se_out.s != -1) {
+ if ((rlay->rl_conf.flags & F_TLSINSPECT) &&
+ con->se_out.state == PRECONNECT) {
if (con->se_out.ssl == NULL) {
log_debug("%s: tls connect failed", __func__);
return (-1);
}
relay_connected(con->se_out.s, EV_WRITE, con);
+ con->se_out.state = CONNECTED;
return (0);
}
- if (relay_inflight < 1)
- fatalx("relay_connect: no connection in flight");
+ if (relay_inflight < 1) {
+ log_warnx("relay_connect: no connection in flight");
+ relay_inflight = 1;
+ }
getmonotime(&con->se_tv_start);
@@ -1555,6 +1576,8 @@ relay_connect(struct rsession *con)
event_del(&rlay->rl_ev);
evtimer_add(&con->se_inflightevt, &evtpause);
evtimer_add(&rlay->rl_evt, &evtpause);
+ /* this connect is pending */
+ con->se_out.state = PENDING;
return (0);
} else {
if (con->se_retry) {
@@ -1572,6 +1595,7 @@ relay_connect(struct rsession *con)
}
}
+ con->se_out.state = CONNECTED;
relay_inflight--;
DPRINTF("%s: inflight decremented, now %d",__func__,
relay_inflight);
@@ -1673,6 +1697,7 @@ relay_close(struct rsession *con, const
event_add(&rlay->rl_ev, NULL);
}
}
+ con->se_out.state = INIT;
if (con->se_out.buf != NULL)
free(con->se_out.buf);
Index: relay_http.c
===================================================================
RCS file: /cvs/src/usr.sbin/relayd/relay_http.c,v
retrieving revision 1.47
diff -u -p -r1.47 relay_http.c
--- relay_http.c 22 May 2015 01:34:13 -0000 1.47
+++ relay_http.c 30 May 2015 11:12:53 -0000
@@ -275,8 +275,10 @@ relay_read_http(struct bufferevent *bev,
goto lookup;
} else if (cre->line == 1 && cre->dir == RELAY_DIR_REQUEST) {
if ((desc->http_method = relay_httpmethod_byname(key))
- == HTTP_METHOD_NONE)
+ == HTTP_METHOD_NONE) {
+ free(line);
goto fail;
+ }
/*
* Decode request path and query
*/
@@ -419,7 +421,7 @@ relay_read_http(struct bufferevent *bev,
relay_reset_http(cre);
done:
if (cre->dir == RELAY_DIR_REQUEST && cre->toread <= 0 &&
- cre->dst->bev == NULL) {
+ cre->dst->state != CONNECTED) {
if (rlay->rl_conf.fwdmode == FWD_TRANS) {
relay_bindanyreq(con, 0, IPPROTO_TCP);
return;
Index: relayd.h
===================================================================
RCS file: /cvs/src/usr.sbin/relayd/relayd.h,v
retrieving revision 1.209
diff -u -p -r1.209 relayd.h
--- relayd.h 2 May 2015 13:15:24 -0000 1.209
+++ relayd.h 30 May 2015 10:36:28 -0000
@@ -200,6 +200,7 @@ struct ctl_relay_event {
int line;
int done;
int timedout;
+ enum { INIT, PENDING, PRECONNECT, CONNECTED } state;
enum direction dir;
u_int8_t *buf;