On 01/03/15 10:05, Poul-Henning Kamp wrote: > The problem with !SO_LINGER (we tried it some years back) is that > all queued data is discarded the moment you call close(2), and we > have no way to know if our last response is still lingering in the > kernel's tcp-buffers and dribbling out to a slow client with a bad > mobile phone connection.
I finally have understood the issue with http keepalive here and that kernels behave differently than how I thought they would. Also I have now understood that phk had already invested significant effort into this and I basically also ended up "out-of-ideas". Except for this: As long as we know that we cannot possibly have written anything, we should be OK to close with RST. The attached updated patch takes a simplistic approach to this. Nils
>From a484814173f58da6e857051031167119a9954860 Mon Sep 17 00:00:00 2001 From: Nils Goroll <[email protected]> Date: Mon, 2 Mar 2015 20:33:15 +0100 Subject: [PATCH] Instruct the kernel to reset the connection for SC_RX_TIMEOUT and others (others being: SC_RX_BODY, SC_RX_OVERFLOW) Closing TCP connections with SO_LINGER unset instructs the kernel to attempt an orderly shutdown intending to finish a 3-way TCP FIN sequence. This is not the optimal solution for cases where we have no data to send to the client: A broken or malicious client may not cooperate shutting down the connection and keeping a connection with the varnish->client direction closed does not make any sense: If the client was to send any data on its still-open direction, our kernel would respond with RST anyway. So we might as well instruct our kernel to close with RST in the first place, with the potential additional benefit of freeing resources used by our kernel and other stateful network equipment. Punchline: closing with SO_LINGER set and a zero timeout (implicit in VTCP_linger()), where applicable, should improve efficiency and eliminate yet another potential DoS vector. With HTTP keepalive connections, we need to consider the case that read timeouts can occur when there is still data to be sent or in transit to the peer (from a previous response). Ideally, we would like to instruct the kernel to inform us when there is nothing left in the send queue or, as phk put it >>a socketopt that says "close X seconds after send-queue empty, provided nothing has been received."<< We know no way of achieving this (on current Linux and FreeBSD kernels), so we fall back to a simplistic approach: Close with VTCP_linger set as long as we know we cannot possibly have written anything. This patch introduces a simple requests-per-session counter which may also have other uses in future. 
--- bin/varnishd/cache/cache.h | 8 +++++++- bin/varnishd/cache/cache_panic.c | 2 +- bin/varnishd/cache/cache_req_fsm.c | 1 + bin/varnishd/cache/cache_session.c | 13 +++++++++++++ include/tbl/sess_close.h | 39 ++++++++++++++++++++++++-------------- 5 files changed, 47 insertions(+), 16 deletions(-) diff --git a/bin/varnishd/cache/cache.h b/bin/varnishd/cache/cache.h index fcb8c69..fca93bd 100644 --- a/bin/varnishd/cache/cache.h +++ b/bin/varnishd/cache/cache.h @@ -82,9 +82,14 @@ enum req_body_state_e { /*--------------------------------------------------------------------*/ +enum sess_close_action_e { + SC_ACT_FIN = 0, + SC_ACT_RST +}; + enum sess_close { SC_NULL = 0, -#define SESS_CLOSE(nm, desc) SC_##nm, +#define SESS_CLOSE(nm, desc, act) SC_##nm, #include "tbl/sess_close.h" #undef SESS_CLOSE }; @@ -693,6 +698,7 @@ struct sess { struct vrt_privs privs[1]; + int reqs; /* # reqs seen on this session */ }; /* Prototypes etc ----------------------------------------------------*/ diff --git a/bin/varnishd/cache/cache_panic.c b/bin/varnishd/cache/cache_panic.c index bc88148..9ac889f 100644 --- a/bin/varnishd/cache/cache_panic.c +++ b/bin/varnishd/cache/cache_panic.c @@ -102,7 +102,7 @@ sess_close_2str(enum sess_close sc, int want_desc) { switch (sc) { case SC_NULL: return(want_desc ? "(null)": "NULL"); -#define SESS_CLOSE(nm, desc) case SC_##nm: return(want_desc ? desc : #nm); +#define SESS_CLOSE(nm, desc, act) case SC_##nm: return(want_desc ? 
desc : #nm); #include "tbl/sess_close.h" #undef SESS_CLOSE diff --git a/bin/varnishd/cache/cache_req_fsm.c b/bin/varnishd/cache/cache_req_fsm.c index d78e24b..fd3138d 100644 --- a/bin/varnishd/cache/cache_req_fsm.c +++ b/bin/varnishd/cache/cache_req_fsm.c @@ -576,6 +576,7 @@ cnt_recv(struct worker *wrk, struct req *req) VSLb(req->vsl, SLT_ReqStart, "%s %s", req->sp->client_addr_str, req->sp->client_port_str); + req->sp->reqs++; http_VSL_log(req->http); diff --git a/bin/varnishd/cache/cache_session.c b/bin/varnishd/cache/cache_session.c index 7992834..b69c05f 100644 --- a/bin/varnishd/cache/cache_session.c +++ b/bin/varnishd/cache/cache_session.c @@ -211,6 +211,7 @@ SES_pool_accept_task(struct worker *wrk, void *arg) sp->t_open = VTIM_real(); sp->t_idle = sp->t_open; sp->vxid = VXID_Get(wrk, VSL_CLIENTMARKER); + sp->reqs = 0; lsockname = VCA_SetupSess(wrk, sp); ses_vsl_socket(sp, lsockname); @@ -318,6 +319,18 @@ SES_Close(struct sess *sp, enum sess_close reason) assert(sp->fd >= 0); sp->reason = reason; + switch (reason) { +#define SESS_CLOSE(nm, desc, act) \ + case SC_ ## nm: \ + if (act == SC_ACT_RST && sp->reqs == 0) \ + (void)VTCP_linger(sp->fd, 1); \ + break; +#include "tbl/sess_close.h" +#undef SESS_CLOSE + default: + WRONG("Wrong reason in SES_Close"); + } + i = close(sp->fd); assert(i == 0 || errno != EBADF); /* XXX EINVAL seen */ sp->fd = -1; diff --git a/include/tbl/sess_close.h b/include/tbl/sess_close.h index 8930b74..be7a64a 100644 --- a/include/tbl/sess_close.h +++ b/include/tbl/sess_close.h @@ -25,23 +25,34 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 * + * + * Note on the close action: + * + * SC_ACT_FIN: We intend to terminate the connection gracefully, preferably + * flushing our response with a TCP 3way FIN handshake + * + * SC_ACT_RST: We intend to terminate as efficiently as possible, signalling + * to the peer that we will not continue reading any data, + * preferably with a TCP RST + * + * keep RX_JUNK and RX_BAD on SC_ACT_FIN as long as we generate a 400 response */ /*lint -save -e525 -e539 */ -SESS_CLOSE(REM_CLOSE, "Client Closed") -SESS_CLOSE(REQ_CLOSE, "Client requested close") -SESS_CLOSE(REQ_HTTP10, "Proto < HTTP/1.1") -SESS_CLOSE(RX_BAD, "Received bad req/resp") -SESS_CLOSE(RX_BODY, "Failure receiving req.body") -SESS_CLOSE(RX_JUNK, "Received junk data") -SESS_CLOSE(RX_OVERFLOW, "Received buffer overflow") -SESS_CLOSE(RX_TIMEOUT, "Receive timeout") -SESS_CLOSE(TX_PIPE, "Piped transaction") -SESS_CLOSE(TX_ERROR, "Error transaction") -SESS_CLOSE(TX_EOF, "EOF transmission") -SESS_CLOSE(RESP_CLOSE, "Backend/VCL requested close") -SESS_CLOSE(OVERLOAD, "Out of some resource") -SESS_CLOSE(SESS_PIPE_OVERFLOW, "Session pipe overflow") +SESS_CLOSE(REM_CLOSE, "Client Closed", SC_ACT_FIN) +SESS_CLOSE(REQ_CLOSE, "Client requested close", SC_ACT_FIN) +SESS_CLOSE(REQ_HTTP10, "Proto < HTTP/1.1", SC_ACT_FIN) +SESS_CLOSE(RX_BAD, "Received bad req/resp", SC_ACT_FIN) +SESS_CLOSE(RX_BODY, "Failure receiving req.body", SC_ACT_RST) +SESS_CLOSE(RX_JUNK, "Received junk data", SC_ACT_FIN) +SESS_CLOSE(RX_OVERFLOW, "Received buffer overflow", SC_ACT_RST) +SESS_CLOSE(RX_TIMEOUT, "Receive timeout", SC_ACT_RST) +SESS_CLOSE(TX_PIPE, "Piped transaction", SC_ACT_FIN) +SESS_CLOSE(TX_ERROR, "Error transaction", SC_ACT_FIN) +SESS_CLOSE(TX_EOF, "EOF transmission", SC_ACT_FIN) +SESS_CLOSE(RESP_CLOSE, "Backend/VCL requested close", SC_ACT_FIN) +SESS_CLOSE(OVERLOAD, "Out of some resource", SC_ACT_FIN) +SESS_CLOSE(SESS_PIPE_OVERFLOW, "Session pipe overflow", SC_ACT_FIN) /*lint -restore */ -- 2.1.4
_______________________________________________ varnish-dev mailing list [email protected] https://www.varnish-cache.org/lists/mailman/listinfo/varnish-dev
