On 01/03/15 10:05, Poul-Henning Kamp wrote:
> The problem with !SO_LINGER (we tried it some years back) is that
> all queued data is discarded the moment you call close(2), and we
> have no way to know if our last response is still lingering in the
> kernel's tcp-buffers and dribbling out to a slow client with a bad
> mobile phone connection.

I finally have understood the issue with http keepalive here and that kernels
behave differently than how I thought they would. Also I have now understood
that phk had already invested significant effort into this and I basically also
ended up "out-of-ideas".

Except for this: As long as we know that we cannot possibly have written
anything, we should be OK to close with RST.

The attached updated patch takes a simplistic approach to this.

Nils
>From a484814173f58da6e857051031167119a9954860 Mon Sep 17 00:00:00 2001
From: Nils Goroll <[email protected]>
Date: Mon, 2 Mar 2015 20:33:15 +0100
Subject: [PATCH] Instruct the kernel to reset the connection for SC_RX_TIMEOUT
 and others

(others being: SC_RX_BODY, SC_RX_OVERFLOW)

Closing TCP connections with SO_LINGER unset instructs the kernel to
attempt an orderly shutdown, intending to complete the TCP FIN/ACK
close sequence. This is not the optimal solution for cases where we have no
data to send to the client: A broken or malicious client may not
cooperate in shutting down the connection, and keeping a connection with
the varnish->client direction closed does not make any sense: If the
client were to send any data on its still-open direction, our kernel
would respond with RST anyway.

So we might as well instruct our kernel to close with RST in the first
place, with the potential additional benefit of freeing resources used
by our kernel and other stateful network equipment.

Punchline: closing with SO_LINGER set and a zero timeout (implicit in
VTCP_linger()), where applicable, should improve efficiency and
eliminate yet another potential DoS vector.

With HTTP keepalive connections, we need to consider the case that
read timeouts can occur when there is still data to be sent or in
transit to the peer (from a previous response). Ideally, we would like
to instruct the kernel to inform us when there is nothing left in the
send queue or, as phk put it >>a socketopt that says "close X seconds
after send-queue empty, provided nothing has been received."<<

We know of no way to achieve this (on current Linux and FreeBSD kernels),
so we fall back to a simplistic approach: Close with VTCP_linger set
as long as we know we cannot possibly have written anything.

This patch introduces a simple requests-per-session counter which
may also have other uses in future.
---
 bin/varnishd/cache/cache.h         |  8 +++++++-
 bin/varnishd/cache/cache_panic.c   |  2 +-
 bin/varnishd/cache/cache_req_fsm.c |  1 +
 bin/varnishd/cache/cache_session.c | 13 +++++++++++++
 include/tbl/sess_close.h           | 39 ++++++++++++++++++++++++--------------
 5 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/bin/varnishd/cache/cache.h b/bin/varnishd/cache/cache.h
index fcb8c69..fca93bd 100644
--- a/bin/varnishd/cache/cache.h
+++ b/bin/varnishd/cache/cache.h
@@ -82,9 +82,14 @@ enum req_body_state_e {
 
 /*--------------------------------------------------------------------*/
 
+enum sess_close_action_e {
+	SC_ACT_FIN = 0,
+	SC_ACT_RST
+};
+
 enum sess_close {
 	SC_NULL = 0,
-#define SESS_CLOSE(nm, desc)	SC_##nm,
+#define SESS_CLOSE(nm, desc, act)	SC_##nm,
 #include "tbl/sess_close.h"
 #undef SESS_CLOSE
 };
@@ -693,6 +698,7 @@ struct sess {
 
 	struct vrt_privs	privs[1];
 
+	int			reqs;	/* # reqs seen on this session */
 };
 
 /* Prototypes etc ----------------------------------------------------*/
diff --git a/bin/varnishd/cache/cache_panic.c b/bin/varnishd/cache/cache_panic.c
index bc88148..9ac889f 100644
--- a/bin/varnishd/cache/cache_panic.c
+++ b/bin/varnishd/cache/cache_panic.c
@@ -102,7 +102,7 @@ sess_close_2str(enum sess_close sc, int want_desc)
 {
 	switch (sc) {
 	case SC_NULL:		return(want_desc ? "(null)": "NULL");
-#define SESS_CLOSE(nm, desc)	case SC_##nm: return(want_desc ? desc : #nm);
+#define SESS_CLOSE(nm, desc, act)	case SC_##nm: return(want_desc ? desc : #nm);
 #include "tbl/sess_close.h"
 #undef SESS_CLOSE
 
diff --git a/bin/varnishd/cache/cache_req_fsm.c b/bin/varnishd/cache/cache_req_fsm.c
index d78e24b..fd3138d 100644
--- a/bin/varnishd/cache/cache_req_fsm.c
+++ b/bin/varnishd/cache/cache_req_fsm.c
@@ -576,6 +576,7 @@ cnt_recv(struct worker *wrk, struct req *req)
 
 	VSLb(req->vsl, SLT_ReqStart, "%s %s",
 	    req->sp->client_addr_str, req->sp->client_port_str);
+	req->sp->reqs++;
 
 	http_VSL_log(req->http);
 
diff --git a/bin/varnishd/cache/cache_session.c b/bin/varnishd/cache/cache_session.c
index 7992834..b69c05f 100644
--- a/bin/varnishd/cache/cache_session.c
+++ b/bin/varnishd/cache/cache_session.c
@@ -211,6 +211,7 @@ SES_pool_accept_task(struct worker *wrk, void *arg)
 	sp->t_open = VTIM_real();
 	sp->t_idle = sp->t_open;
 	sp->vxid = VXID_Get(wrk, VSL_CLIENTMARKER);
+	sp->reqs = 0;
 
 	lsockname = VCA_SetupSess(wrk, sp);
 	ses_vsl_socket(sp, lsockname);
@@ -318,6 +319,18 @@ SES_Close(struct sess *sp, enum sess_close reason)
 
 	assert(sp->fd >= 0);
 	sp->reason = reason;
+	switch (reason) {
+#define SESS_CLOSE(nm, desc, act)					\
+		case SC_ ## nm:					\
+			if (act == SC_ACT_RST && sp->reqs == 0)	\
+				(void)VTCP_linger(sp->fd, 1);		\
+			break;
+#include "tbl/sess_close.h"
+#undef SESS_CLOSE
+	default:
+		WRONG("Wrong resason in SES_Close");
+	}
+
 	i = close(sp->fd);
 	assert(i == 0 || errno != EBADF); /* XXX EINVAL seen */
 	sp->fd = -1;
diff --git a/include/tbl/sess_close.h b/include/tbl/sess_close.h
index 8930b74..be7a64a 100644
--- a/include/tbl/sess_close.h
+++ b/include/tbl/sess_close.h
@@ -25,23 +25,34 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
+ *
+ * Note on the close action:
+ *
+ * SC_ACT_FIN: We intend to terminate the connection gracefully, preferably
+ *	       flushing our response with a TCP 3way FIN handshake
+ *
+ * SC_ACT_RST: We intend to terminate as efficiently as possible, signalling
+ *	       to the peer that we will not continue reading any data,
+ *	       preferably with a TCP RST
+ *
+ * keep RX_JUNK and RX_BAD on SC_ACT_FIN as long as we generate a 400 response
  */
 
 /*lint -save -e525 -e539 */
 
-SESS_CLOSE(REM_CLOSE,		"Client Closed")
-SESS_CLOSE(REQ_CLOSE,		"Client requested close")
-SESS_CLOSE(REQ_HTTP10,		"Proto < HTTP/1.1")
-SESS_CLOSE(RX_BAD,		"Received bad req/resp")
-SESS_CLOSE(RX_BODY,		"Failure receiving req.body")
-SESS_CLOSE(RX_JUNK,		"Received junk data")
-SESS_CLOSE(RX_OVERFLOW,		"Received buffer overflow")
-SESS_CLOSE(RX_TIMEOUT,		"Receive timeout")
-SESS_CLOSE(TX_PIPE,		"Piped transaction")
-SESS_CLOSE(TX_ERROR,		"Error transaction")
-SESS_CLOSE(TX_EOF,		"EOF transmission")
-SESS_CLOSE(RESP_CLOSE,		"Backend/VCL requested close")
-SESS_CLOSE(OVERLOAD,		"Out of some resource")
-SESS_CLOSE(SESS_PIPE_OVERFLOW,	"Session pipe overflow")
+SESS_CLOSE(REM_CLOSE,		"Client Closed",		SC_ACT_FIN)
+SESS_CLOSE(REQ_CLOSE,		"Client requested close",	SC_ACT_FIN)
+SESS_CLOSE(REQ_HTTP10,		"Proto < HTTP/1.1",		SC_ACT_FIN)
+SESS_CLOSE(RX_BAD,		"Received bad req/resp",	SC_ACT_FIN)
+SESS_CLOSE(RX_BODY,		"Failure receiving req.body",	SC_ACT_RST)
+SESS_CLOSE(RX_JUNK,		"Received junk data",		SC_ACT_FIN)
+SESS_CLOSE(RX_OVERFLOW,	"Received buffer overflow",	SC_ACT_RST)
+SESS_CLOSE(RX_TIMEOUT,		"Receive timeout",		SC_ACT_RST)
+SESS_CLOSE(TX_PIPE,		"Piped transaction",		SC_ACT_FIN)
+SESS_CLOSE(TX_ERROR,		"Error transaction",		SC_ACT_FIN)
+SESS_CLOSE(TX_EOF,		"EOF transmission",		SC_ACT_FIN)
+SESS_CLOSE(RESP_CLOSE,		"Backend/VCL requested close",	SC_ACT_FIN)
+SESS_CLOSE(OVERLOAD,		"Out of some resource",	SC_ACT_FIN)
+SESS_CLOSE(SESS_PIPE_OVERFLOW,	"Session pipe overflow",	SC_ACT_FIN)
 
 /*lint -restore */
-- 
2.1.4

_______________________________________________
varnish-dev mailing list
[email protected]
https://www.varnish-cache.org/lists/mailman/listinfo/varnish-dev

Reply via email to