sepherosa_gmail.com created this revision.
sepherosa_gmail.com added reviewers: network, adrian, delphij, royger, 
decui_microsoft.com, honzhan_microsoft.com, howard0su_gmail.com, gallatin, 
hselasky, np.
sepherosa_gmail.com added subscribers: freebsd-net-list, 
freebsd-virtualization-list.
Herald added a reviewer: transport.

REVISION SUMMARY
  It's append_cnt based.  Unless the network driver sets these two limits, its 
an NO-OP.
  
  For hn(4):
  
  - Set TCP ACK append limit to 1, i.e. aggregate 2 ACKs at most.  Aggregate 
anything more than 2 hurts TCP sending performance in hyperv.  This 
significantly improves the TCP sending performance when the number of 
concurrent connetion is low (2~8).  And greatly stabilize the TCP sending 
performance in other cases.
  - Set TCP data segments append limit to 25.  Without this limitation, hn(4) 
could aggregate ~45 TCP data segments for each connection (even at 64 or more 
connections) before dispatching them to socket code; large aggregation slows 
down ACK sending and eventually hurts/destabilizes TCP reception performance.  
This setting stabilizes and improves TCP reception performance for >4 
concurrent connections significantly.
  
  Make them sysctls so they could be adjusted.

REVISION DETAIL
  https://reviews.freebsd.org/D5185

AFFECTED FILES
  sys/dev/hyperv/netvsc/hv_net_vsc.h
  sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
  sys/netinet/tcp_lro.c
  sys/netinet/tcp_lro.h

EMAIL PREFERENCES
  https://reviews.freebsd.org/settings/panel/emailpreferences/

To: sepherosa_gmail.com, network, transport, adrian, delphij, royger, 
decui_microsoft.com, honzhan_microsoft.com, howard0su_gmail.com, gallatin, 
hselasky, np
Cc: freebsd-virtualization-list, freebsd-net-list
diff --git a/sys/netinet/tcp_lro.h b/sys/netinet/tcp_lro.h
--- a/sys/netinet/tcp_lro.h
+++ b/sys/netinet/tcp_lro.h
@@ -91,6 +91,8 @@
 	unsigned	lro_cnt;
 	unsigned	lro_mbuf_count;
 	unsigned	lro_mbuf_max;
+	unsigned short	lro_ack_append_lim;
+	unsigned short	lro_data_append_lim;
 
 	struct lro_head	lro_active;
 	struct lro_head	lro_free;
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -88,6 +88,8 @@
 	lc->lro_mbuf_max = lro_mbufs;
 	lc->lro_cnt = lro_entries;
 	lc->ifp = ifp;
+	lc->lro_ack_append_lim = 0;
+	lc->lro_data_append_lim = 0;
 	SLIST_INIT(&lc->lro_free);
 	SLIST_INIT(&lc->lro_active);
 
@@ -646,6 +648,16 @@
 
 		if (tcp_data_len == 0) {
 			m_freem(m);
+			/*
+			 * Flush this LRO entry, if this ACK should
+			 * not be further delayed.
+			 */
+			if (lc->lro_ack_append_lim &&
+			    le->append_cnt >= lc->lro_ack_append_lim) {
+				SLIST_REMOVE(&lc->lro_active, le, lro_entry,
+				    next);
+				tcp_lro_flush(lc, le);
+			}
 			return (0);
 		}
 
@@ -664,9 +676,12 @@
 
 		/*
 		 * If a possible next full length packet would cause an
-		 * overflow, pro-actively flush now.
+		 * overflow, pro-actively flush now.  And if we are asked
+		 * to limit the data aggregate, flush this LRO entry now.
 		 */
-		if (le->p_len > (65535 - lc->ifp->if_mtu)) {
+		if (le->p_len > (65535 - lc->ifp->if_mtu) ||
+		    (lc->lro_data_append_lim &&
+		     le->append_cnt >= lc->lro_data_append_lim)) {
 			SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
 			tcp_lro_flush(lc, le);
 		} else
diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
--- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
@@ -176,14 +176,8 @@
 #define HN_CSUM_ASSIST_WIN8	(CSUM_TCP)
 #define HN_CSUM_ASSIST		(CSUM_IP | CSUM_UDP | CSUM_TCP)
 
-/* XXX move to netinet/tcp_lro.h */
-#define HN_LRO_HIWAT_MAX				65535
-#define HN_LRO_HIWAT_DEF				HN_LRO_HIWAT_MAX
-/* YYY 2*MTU is a bit rough, but should be good enough. */
-#define HN_LRO_HIWAT_MTULIM(ifp)			(2 * (ifp)->if_mtu)
-#define HN_LRO_HIWAT_ISVALID(sc, hiwat)			\
-    ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) ||	\
-     (hiwat) <= HN_LRO_HIWAT_MAX)
+#define HN_LRO_ACK_APPEND_LIM	1
+#define HN_LRO_DATA_APPEND_LIM	25
 
 /*
  * Be aware that this sleepable mutex will exhibit WITNESS errors when
@@ -253,27 +247,16 @@
 static void hn_start_txeof(struct ifnet *ifp);
 static int hn_ifmedia_upd(struct ifnet *ifp);
 static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
-#ifdef HN_LRO_HIWAT
-static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS);
-#endif
 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_lro_append_lim_sysctl(SYSCTL_HANDLER_ARGS);
 static int hn_check_iplen(const struct mbuf *, int);
 static int hn_create_tx_ring(struct hn_softc *sc);
 static void hn_destroy_tx_ring(struct hn_softc *sc);
 static void hn_start_taskfunc(void *xsc, int pending);
 static void hn_txeof_taskfunc(void *xsc, int pending);
 static int hn_encap(struct hn_softc *, struct hn_txdesc *, struct mbuf **);
 
-static __inline void
-hn_set_lro_hiwat(struct hn_softc *sc, int hiwat)
-{
-	sc->hn_lro_hiwat = hiwat;
-#ifdef HN_LRO_HIWAT
-	sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
-#endif
-}
-
 static int
 hn_ifmedia_upd(struct ifnet *ifp __unused)
 {
@@ -358,7 +341,6 @@
 	bzero(sc, sizeof(hn_softc_t));
 	sc->hn_unit = unit;
 	sc->hn_dev = dev;
-	sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF;
 	sc->hn_direct_tx_size = hn_direct_tx_size;
 	if (hn_trust_hosttcp)
 		sc->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
@@ -442,9 +424,8 @@
 	/* Driver private LRO settings */
 	sc->hn_lro.ifp = ifp;
 #endif
-#ifdef HN_LRO_HIWAT
-	sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
-#endif
+	sc->hn_lro.lro_ack_append_lim = HN_LRO_ACK_APPEND_LIM;
+	sc->hn_lro.lro_data_append_lim = HN_LRO_DATA_APPEND_LIM;
 #endif	/* INET || INET6 */
 
 #if __FreeBSD_version >= 1100045
@@ -471,6 +452,13 @@
 	    hn_tx_chimney_size < sc->hn_tx_chimney_max)
 		sc->hn_tx_chimney_size = hn_tx_chimney_size;
 
+	/*
+	 * Always schedule transmission instead of trying
+	 * to do direct transmission.  This one gives the
+	 * best performance so far.
+	 */
+	sc->hn_sched_tx = 1;
+
 	ctx = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 
@@ -480,11 +468,6 @@
 	    CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed");
 	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried",
 	    CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries");
-#ifdef HN_LRO_HIWAT
-	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat",
-	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl,
-	    "I", "LRO high watermark");
-#endif
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
 	    CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_TCP,
 	    hn_trust_hcsum_sysctl, "I",
@@ -538,6 +521,14 @@
 	    CTLFLAG_RW, &sc->hn_sched_tx, 0,
 	    "Always schedule transmission "
 	    "instead of doing direct transmission");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ack_append_lim",
+	    CTLTYPE_INT | CTLFLAG_RW, &sc->hn_lro.lro_ack_append_lim,
+	    0, hn_lro_append_lim_sysctl,
+	    "I", "LRO ACK appending limitation");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_data_append_lim",
+	    CTLTYPE_INT | CTLFLAG_RW, &sc->hn_lro.lro_data_append_lim,
+	    0, hn_lro_append_lim_sysctl,
+	    "I", "LRO data segments appending limitation");
 
 	if (unit == 0) {
 		struct sysctl_ctx_list *dc_ctx;
@@ -1410,13 +1401,6 @@
 
 		/* Obtain and record requested MTU */
 		ifp->if_mtu = ifr->ifr_mtu;
-		/*
-		 * Make sure that LRO high watermark is still valid,
-		 * after MTU change (the 2*MTU limit).
-		 */
-		if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat))
-			hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp));
-
 		do {
 			NV_LOCK(sc);
 			if (!sc->temp_unusable) {
@@ -1722,27 +1706,6 @@
 }
 #endif
 
-#ifdef HN_LRO_HIWAT
-static int
-hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS)
-{
-	struct hn_softc *sc = arg1;
-	int hiwat, error;
-
-	hiwat = sc->hn_lro_hiwat;
-	error = sysctl_handle_int(oidp, &hiwat, 0, req);
-	if (error || req->newptr == NULL)
-		return error;
-
-	if (!HN_LRO_HIWAT_ISVALID(sc, hiwat))
-		return EINVAL;
-
-	if (sc->hn_lro_hiwat != hiwat)
-		hn_set_lro_hiwat(sc, hiwat);
-	return 0;
-}
-#endif	/* HN_LRO_HIWAT */
-
 static int
 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
 {
@@ -1787,6 +1750,24 @@
 }
 
 static int
+hn_lro_append_lim_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	unsigned short *lro_lim = arg1;
+	int lim, error;
+
+	lim = *lro_lim;
+	error = sysctl_handle_int(oidp, &lim, 0, req);
+	if (error || req->newptr == NULL)
+		return error;
+
+	if (lim < 0 || lim > 65535)
+		return EINVAL;
+
+	*lro_lim = lim;
+	return 0;
+}
+
+static int
 hn_check_iplen(const struct mbuf *m, int hoff)
 {
 	const struct ip *ip;
diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h
--- a/sys/dev/hyperv/netvsc/hv_net_vsc.h
+++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h
@@ -1030,7 +1030,6 @@
 	struct task	hn_txeof_task;
 
 	struct lro_ctrl	hn_lro;
-	int		hn_lro_hiwat;
 
 	/* Trust csum verification on host side */
 	int		hn_trust_hcsum;	/* HN_TRUST_HCSUM_ */

_______________________________________________
freebsd-virtualization@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/freebsd-virtualization
To unsubscribe, send any mail to 
"freebsd-virtualization-unsubscr...@freebsd.org"

Reply via email to