On Fri, May 22, 2015 at 09:18:29PM +0200, Alexander Bluhm wrote:
> This breaks relayd with TLS inspection.  Moving down the "called
> once" check after the F_TLSINSPECT block fixes the plain SSL case.
> But HTTPS still hangs.  I have just commited a test.  Try
> 
> cd /usr/src/regress/usr.sbin/relayd && make run-regress-args-https-inspect.pl
> 
> bluhm
> 
> On Fri, May 22, 2015 at 03:55:16PM +0200, Claudio Jeker wrote:
> > On our production systems we did hit the "relay_connect: no connection in
> > flight" on a so regular bases that I had to make it non-fatal with the
> > result of leaking sockets.
> > 
> > After more investigation I found the problem to be a race against
> > connecting to the backend servers. In short:
> > - relay_read_http() will open a connection if following conditions are met
> >   cre->dir == RELAY_DIR_REQUEST && cre->toread <= 0 && cre->dst->bev == NULL
> > 
> > - relay_connect() does not initialize con->se_out.bev (which is also
> >   cre->dst->bev). Instead this is deferred to relay_connected()
> > 
> > - if a event happens that calls relay_read_http() while connecting to the
> >   backend then relay_connect() will be called again. Result is the panic
> >   since the count gets out of sync.
> > 
> > The following diff solves this issue by adding an extra flag to
> > ctl_relay_event to know if a relay is already connected (or the connect is
> > pending). relay_close() will then clean the flag when closing the session.
> > I decided to use a flag since the EMFILE || ENFILE case is hard to detect
> > otherwise.
> > 
> > Running with this on production with no visible issues at the moment.
> > I think it would make sense to restructure the http proxy code more and
> > introduce a proper state machine but that is a much bigger and complex
> > issue, so lets fix the bug first.
> > 

New version that now passes all regress tests. It got a fair bit more
complex because the F_TLSINSPECT case results in multiple entries of
relay_connect and relay_connected. So I switched to use a state variable
instead of a flag. It seems to work (and also works in the EMFILE/ENFILE
case). At least it "worked" for me by forcing that code path all the time.

-- 
:wq Claudio

Index: relay.c
===================================================================
RCS file: /cvs/src/usr.sbin/relayd/relay.c,v
retrieving revision 1.194
diff -u -p -r1.194 relay.c
--- relay.c     18 May 2015 16:57:20 -0000      1.194
+++ relay.c     30 May 2015 11:12:27 -0000
@@ -1408,8 +1408,10 @@ relay_connect_retry(int fd, short sig, v
        struct relay    *rlay = con->se_relay;
        int              bnds = -1;
 
-       if (relay_inflight < 1)
-               fatalx("relay_connect_retry: no connection in flight");
+       if (relay_inflight < 1) {
+               log_warnx("relay_connect_retry: no connection in flight");
+               relay_inflight = 1;
+       }
 
        DPRINTF("%s: retry %d of %d, inflight: %d",__func__,
            con->se_retrycount, con->se_retry, relay_inflight);
@@ -1466,6 +1468,10 @@ relay_connect_retry(int fd, short sig, v
                return;
        }
 
+       if (rlay->rl_conf.flags & F_TLSINSPECT)
+               con->se_out.state = PRECONNECT;
+       else
+               con->se_out.state = CONNECTED;
        relay_inflight--;
        DPRINTF("%s: inflight decremented, now %d",__func__, relay_inflight);
 
@@ -1484,9 +1490,14 @@ relay_connect_retry(int fd, short sig, v
 int
 relay_preconnect(struct rsession *con)
 {
+       int rv;
+
        log_debug("%s: session %d: process %d", __func__,
            con->se_id, privsep_process);
-       return (relay_connect(con));
+       rv = relay_connect(con);
+       if (con->se_out.state == CONNECTED)
+               con->se_out.state = PRECONNECT;
+       return (rv);
 }
 
 int
@@ -1496,18 +1507,28 @@ relay_connect(struct rsession *con)
        struct timeval   evtpause = { 1, 0 };
        int              bnds = -1, ret;
 
+       /* relay_connect should only be called once per relay */
+       if (con->se_out.state == CONNECTED) {
+               log_debug("%s: connect already called once", __func__);
+               return (0);
+       }
+
        /* Connection is already established but session not active */
-       if ((rlay->rl_conf.flags & F_TLSINSPECT) && con->se_out.s != -1) {
+       if ((rlay->rl_conf.flags & F_TLSINSPECT) &&
+           con->se_out.state == PRECONNECT) {
                if (con->se_out.ssl == NULL) {
                        log_debug("%s: tls connect failed", __func__);
                        return (-1);
                }
                relay_connected(con->se_out.s, EV_WRITE, con);
+               con->se_out.state = CONNECTED;
                return (0);
        }
 
-       if (relay_inflight < 1)
-               fatalx("relay_connect: no connection in flight");
+       if (relay_inflight < 1) {
+               log_warnx("relay_connect: no connection in flight");
+               relay_inflight = 1;
+       }
 
        getmonotime(&con->se_tv_start);
 
@@ -1555,6 +1576,8 @@ relay_connect(struct rsession *con)
                        event_del(&rlay->rl_ev);
                        evtimer_add(&con->se_inflightevt, &evtpause);
                        evtimer_add(&rlay->rl_evt, &evtpause);
+                       /* this connect is pending */
+                       con->se_out.state = PENDING;
                        return (0);
                } else {
                        if (con->se_retry) {
@@ -1572,6 +1595,7 @@ relay_connect(struct rsession *con)
                }
        }
 
+       con->se_out.state = CONNECTED;
        relay_inflight--;
        DPRINTF("%s: inflight decremented, now %d",__func__,
            relay_inflight);
@@ -1673,6 +1697,7 @@ relay_close(struct rsession *con, const 
                        event_add(&rlay->rl_ev, NULL);
                }
        }
+       con->se_out.state = INIT;
 
        if (con->se_out.buf != NULL)
                free(con->se_out.buf);
Index: relay_http.c
===================================================================
RCS file: /cvs/src/usr.sbin/relayd/relay_http.c,v
retrieving revision 1.47
diff -u -p -r1.47 relay_http.c
--- relay_http.c        22 May 2015 01:34:13 -0000      1.47
+++ relay_http.c        30 May 2015 11:12:53 -0000
@@ -275,8 +275,10 @@ relay_read_http(struct bufferevent *bev,
                        goto lookup;
                } else if (cre->line == 1 && cre->dir == RELAY_DIR_REQUEST) {
                        if ((desc->http_method = relay_httpmethod_byname(key))
-                           == HTTP_METHOD_NONE)
+                           == HTTP_METHOD_NONE) {
+                               free(line);
                                goto fail;
+                       }
                        /*
                         * Decode request path and query
                         */
@@ -419,7 +421,7 @@ relay_read_http(struct bufferevent *bev,
                relay_reset_http(cre);
  done:
                if (cre->dir == RELAY_DIR_REQUEST && cre->toread <= 0 &&
-                   cre->dst->bev == NULL) {
+                   cre->dst->state != CONNECTED) {
                        if (rlay->rl_conf.fwdmode == FWD_TRANS) {
                                relay_bindanyreq(con, 0, IPPROTO_TCP);
                                return;
Index: relayd.h
===================================================================
RCS file: /cvs/src/usr.sbin/relayd/relayd.h,v
retrieving revision 1.209
diff -u -p -r1.209 relayd.h
--- relayd.h    2 May 2015 13:15:24 -0000       1.209
+++ relayd.h    30 May 2015 10:36:28 -0000
@@ -200,6 +200,7 @@ struct ctl_relay_event {
        int                      line;
        int                      done;
        int                      timedout;
+       enum { INIT, PENDING, PRECONNECT, CONNECTED } state;
        enum direction           dir;
 
        u_int8_t                *buf;

Reply via email to