Hi,

I've tested the attached patch with success and would like to have some feedback from other FreeBSD network developers. The problem is that the current TSO limitation only limits the number of bytes that can be transferred in a TSO packet, and not the number of mbufs.

The current solution is to have a quick and dirty custom m_dup() in the TX path to re-allocate the mbuf chains into 4K ones to make it simple. This whole hack can be avoided if the definition of the TSO limit is changed slightly, as shown here:


 /*
+ * Structure defining hardware TSO limits.
+ */
+struct if_tso_limit {
+       u_int raw_value[0];     /* access all fields as one */
+       u_char frag_count;      /* maximum number of fragments: 1..255 */
+       u_char frag_size_log2;  /* maximum fragment size: 2 ** (12..16) */
+       u_char hdr_size_log2;   /* maximum header size: 2 ** (2..8) */
+       u_char reserved;        /* zero */
+};


First we need to know the maximum fragment count. Typical value is 32.
Second we need to know the maximum fragment size. Typical value is 4K.
Last we need to know of any headers that should be subtracted from the maximum. Since this code runs in the fast path, I would like to use "u_char" for all fields and allow copy-only access as a "u_int" as an optimization. This avoids kludges and messing with additional header files.

I would like to push this patch after some more testing to -current and then to 10-stable, hopefully before the coming 10-release, because the current solution is affecting performance of the Mellanox based network adapters in an unfair way. For example, by setting the current TSO limit to 32KBytes, which will be OK for all-2K fragments, we see a severe degradation in performance, even though the hardware is fully capable of transmitting 16 4K mbufs.

Comments and reviews are welcome!

--HPS
=== sys/dev/oce/oce_if.c
==================================================================
--- sys/dev/oce/oce_if.c	(revision 270996)
+++ sys/dev/oce/oce_if.c	(local)
@@ -1731,7 +1731,9 @@
 	sc->ifp->if_baudrate = IF_Gbps(10);
 
 #if __FreeBSD_version >= 1000000
-	sc->ifp->if_hw_tsomax = OCE_MAX_TSO_SIZE;
+	sc->ifp->if_hw_tsomax.frag_count = 29;		/* 29 elements */
+	sc->ifp->if_hw_tsomax.frag_size_log2 = 12;	/* 4K */
+	sc->ifp->if_hw_tsomax.hdr_size_log2 = 5;	/* ETH+VLAN < 2**5 */
 #endif
 
 	ether_ifattach(sc->ifp, sc->macaddr.mac_addr);
=== sys/dev/oce/oce_if.h
==================================================================
--- sys/dev/oce/oce_if.h	(revision 270996)
+++ sys/dev/oce/oce_if.h	(local)
@@ -152,7 +152,6 @@
 #define OCE_MAX_TX_ELEMENTS		29
 #define OCE_MAX_TX_DESC			1024
 #define OCE_MAX_TX_SIZE			65535
-#define OCE_MAX_TSO_SIZE		(65535 - ETHER_HDR_LEN)
 #define OCE_MAX_RX_SIZE			4096
 #define OCE_MAX_RQ_POSTS		255
 #define OCE_DEFAULT_PROMISCUOUS		0
=== sys/dev/vmware/vmxnet3/if_vmx.c
==================================================================
--- sys/dev/vmware/vmxnet3/if_vmx.c	(revision 270996)
+++ sys/dev/vmware/vmxnet3/if_vmx.c	(local)
@@ -1722,7 +1722,9 @@
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_init = vmxnet3_init;
 	ifp->if_ioctl = vmxnet3_ioctl;
-	ifp->if_hw_tsomax = VMXNET3_TSO_MAXSIZE;
+	ifp->if_hw_tsomax.frag_count = VMXNET3_TX_MAXSEGS;
+	ifp->if_hw_tsomax.frag_size_log2 = VMXNET3_TX_MAXSEGSHIFT;
+	ifp->if_hw_tsomax.hdr_size_log2 = 5;	/* ETH+VLAN < 2**5 */
 
 #ifdef VMXNET3_LEGACY_TX
 	ifp->if_start = vmxnet3_start;
=== sys/dev/vmware/vmxnet3/if_vmxvar.h
==================================================================
--- sys/dev/vmware/vmxnet3/if_vmxvar.h	(revision 270996)
+++ sys/dev/vmware/vmxnet3/if_vmxvar.h	(local)
@@ -277,14 +277,13 @@
  */
 #define VMXNET3_TX_MAXSEGS		32
 #define VMXNET3_TX_MAXSIZE		(VMXNET3_TX_MAXSEGS * MCLBYTES)
-#define VMXNET3_TSO_MAXSIZE \
-    (VMXNET3_TX_MAXSIZE - sizeof(struct ether_vlan_header))
 
 /*
  * Maximum support Tx segments size. The length field in the
  * Tx descriptor is 14 bits.
  */
-#define VMXNET3_TX_MAXSEGSIZE		(1 << 14)
+#define VMXNET3_TX_MAXSEGSHIFT		14
+#define VMXNET3_TX_MAXSEGSIZE		(1 << VMXNET3_TX_MAXSEGSHIFT)
 
 /*
  * The maximum number of Rx segments we accept. When LRO is enabled,
=== sys/dev/xen/netfront/netfront.c
==================================================================
--- sys/dev/xen/netfront/netfront.c	(revision 270996)
+++ sys/dev/xen/netfront/netfront.c	(local)
@@ -134,7 +134,6 @@
  * to mirror the Linux MAX_SKB_FRAGS constant.
  */
 #define	MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2)
-#define	NF_TSO_MAXBURST ((IP_MAXPACKET / PAGE_SIZE) * MCLBYTES)
 
 #define RX_COPY_THRESHOLD 256
 
@@ -2102,7 +2101,9 @@
 	
     	ifp->if_hwassist = XN_CSUM_FEATURES;
     	ifp->if_capabilities = IFCAP_HWCSUM;
-	ifp->if_hw_tsomax = NF_TSO_MAXBURST;
+	ifp->if_hw_tsomax.frag_count = MAX_TX_REQ_FRAGS;
+	ifp->if_hw_tsomax.frag_size_log2 = PAGE_SHIFT;
+	ifp->if_hw_tsomax.hdr_size_log2 = 5;	/* ETH+VLAN < 2**5 */
 	
     	ether_ifattach(ifp, np->mac);
     	callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE);
=== sys/net/if.c
==================================================================
--- sys/net/if.c	(revision 270996)
+++ sys/net/if.c	(local)
@@ -445,6 +445,7 @@
 	ifp->if_index = idx;
 	ifp->if_type = type;
 	ifp->if_alloctype = type;
+	ifp->if_hw_tsomax = IF_TSO_LIMIT_DEFAULT();
 	if (if_com_alloc[type] != NULL) {
 		ifp->if_l2com = if_com_alloc[type](type, ifp);
 		if (ifp->if_l2com == NULL) {
@@ -657,16 +658,6 @@
 		TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 		/* Reliably crash if used uninitialized. */
 		ifp->if_broadcastaddr = NULL;
-
-#if defined(INET) || defined(INET6)
-		/* Initialize to max value. */
-		if (ifp->if_hw_tsomax == 0)
-			ifp->if_hw_tsomax = min(IP_MAXPACKET, 32 * MCLBYTES -
-			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
-		KASSERT(ifp->if_hw_tsomax <= IP_MAXPACKET &&
-		    ifp->if_hw_tsomax >= IP_MAXPACKET / 8,
-		    ("%s: tsomax outside of range", __func__));
-#endif
 	}
 #ifdef VIMAGE
 	else {
=== sys/net/if_lagg.c
==================================================================
--- sys/net/if_lagg.c	(revision 270996)
+++ sys/net/if_lagg.c	(local)
@@ -445,11 +445,7 @@
 	struct lagg_port *lp;
 	int cap = ~0, ena = ~0;
 	u_long hwa = ~0UL;
-#if defined(INET) || defined(INET6)
-	u_int hw_tsomax = IP_MAXPACKET;	/* Initialize to the maximum value. */
-#else
-	u_int hw_tsomax = ~0;	/* if_hw_tsomax is only for INET/INET6, but.. */
-#endif
+	struct if_tso_limit hw_tsomax = IF_TSO_LIMIT_DEFAULT();
 
 	LAGG_WLOCK_ASSERT(sc);
 
@@ -458,10 +454,9 @@
 		cap &= lp->lp_ifp->if_capabilities;
 		ena &= lp->lp_ifp->if_capenable;
 		hwa &= lp->lp_ifp->if_hwassist;
-		/* Set to the minimum value of the lagg ports. */
-		if (lp->lp_ifp->if_hw_tsomax < hw_tsomax &&
-		    lp->lp_ifp->if_hw_tsomax > 0)
-			hw_tsomax = lp->lp_ifp->if_hw_tsomax;
+		/* Set to the common value of the lagg ports. */
+		hw_tsomax = IF_TSO_LIMIT_COMMON(&hw_tsomax,
+		    &lp->lp_ifp->if_hw_tsomax);
 	}
 	cap = (cap == ~0 ? 0 : cap);
 	ena = (ena == ~0 ? 0 : ena);
@@ -470,7 +465,7 @@
 	if (sc->sc_ifp->if_capabilities != cap ||
 	    sc->sc_ifp->if_capenable != ena ||
 	    sc->sc_ifp->if_hwassist != hwa ||
-	    sc->sc_ifp->if_hw_tsomax != hw_tsomax) {
+	    IF_TSO_LIMIT_CMP(&sc->sc_ifp->if_hw_tsomax, !=, &hw_tsomax)) {
 		sc->sc_ifp->if_capabilities = cap;
 		sc->sc_ifp->if_capenable = ena;
 		sc->sc_ifp->if_hwassist = hwa;
=== sys/net/if_var.h
==================================================================
--- sys/net/if_var.h	(revision 270996)
+++ sys/net/if_var.h	(local)
@@ -120,6 +120,36 @@
 typedef	uint64_t (*if_get_counter_t)(if_t, ifnet_counter);
 
 /*
+ * Structure defining hardware TSO limits.
+ */
+struct if_tso_limit {
+	u_int raw_value[0];	/* access all fields as one */
+	u_char frag_count;	/* maximum number of fragments: 1..255 */
+	u_char frag_size_log2;	/* maximum fragment size: 2 ** (12..16) */
+	u_char hdr_size_log2;	/* maximum header size: 2 ** (2..8) */
+	u_char reserved;	/* zero */
+};
+
+#define	IF_TSO_LIMIT_DEFAULT() ({		\
+struct if_tso_limit tso_temp = {		\
+  .frag_count = 128,				\
+  .frag_size_log2 = 16,				\
+  .hdr_size_log2 = 2,				\
+  .reserved = 0,				\
+}; tso_temp; })
+
+#define	IF_TSO_LIMIT_COMMON(a,b) ({				\
+struct if_tso_limit tso_temp = {				\
+  .frag_count = min((a)->frag_count, (b)->frag_count),		\
+  .frag_size_log2 = min((a)->frag_size_log2, (b)->frag_size_log2),\
+  .hdr_size_log2 = max((a)->hdr_size_log2, (b)->hdr_size_log2),	\
+  .reserved = 0,						\
+}; tso_temp; })
+
+#define	IF_TSO_LIMIT_CMP(a,op,b)		\
+  ((a)->raw_value[0] op (b)->raw_value[0])
+
+/*
  * Structure defining a network interface.
  *
  * Size ILP32:  592 (approx)
@@ -222,10 +252,8 @@
 	if_get_counter_t if_get_counter; /* get counter values */
 
 	/* Stuff that's only temporary and doesn't belong here. */
-	u_int	if_hw_tsomax;		/* tso burst length limit, the minimum
-					 * is (IP_MAXPACKET / 8).
-					 * XXXAO: Have to find a better place
-					 * for it eventually. */
+	struct if_tso_limit if_hw_tsomax;
+
 	/*
 	 * Old, racy and expensive statistics, should not be used in
 	 * new drivers.
=== sys/net/if_vlan.c
==================================================================
--- sys/net/if_vlan.c	(revision 270996)
+++ sys/net/if_vlan.c	(local)
@@ -1511,8 +1511,8 @@
 	 * propagate the hardware-assisted flag. TSO on VLANs
 	 * does not necessarily require hardware VLAN tagging.
 	 */
-	if (p->if_hw_tsomax > 0)
-		ifp->if_hw_tsomax = p->if_hw_tsomax;
+	ifp->if_hw_tsomax = IF_TSO_LIMIT_COMMON(&ifp->if_hw_tsomax,
+	    &p->if_hw_tsomax);
 	if (p->if_capabilities & IFCAP_VLAN_HWTSO)
 		ifp->if_capabilities |= p->if_capabilities & IFCAP_TSO;
 	if (p->if_capenable & IFCAP_VLAN_HWTSO) {
=== sys/netinet/tcp_output.c
==================================================================
--- sys/netinet/tcp_output.c	(revision 270996)
+++ sys/netinet/tcp_output.c	(local)
@@ -767,9 +767,70 @@
 		flags &= ~TH_FIN;
 
 		if (tso) {
+			struct if_tso_limit if_hw_tsomax;
+			struct mbuf *mb;
+			u_int rem_frags;
+			u_int moff;
+			int max_len;
+
+			/* copy TSO limit information */
+			if_hw_tsomax.raw_value[0] = tp->t_tsomax;
+
+			/* compute maximum TSO length */
+			max_len = (((u_int)if_hw_tsomax.frag_count) <<
+			    if_hw_tsomax.frag_size_log2) - hdrlen -
+			    (1 << if_hw_tsomax.hdr_size_log2);
+
+			/* clamp maximum length value */
+			if (max_len > IP_MAXPACKET)
+				max_len = IP_MAXPACKET;
+			else if (max_len < 0)
+				max_len = 0;
+
+			/* get smallest length */
+			if (len > (u_int)max_len) {
+				sendalot = 1;
+				len = (u_int)max_len;
+			}
+
+			/* get remaining fragments */
+			rem_frags = if_hw_tsomax.frag_count;
+
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
+			max_len = 0;
+			mb = sbsndptr(&so->so_snd, off, len, &moff);
+
+			/* now make sure the number of fragments fit too */
+			while (mb != NULL && (u_int)max_len < len) {
+				u_int cur_length;
+				u_int cur_frags;
+
+				/*
+				 * Get length of mbuf fragment and how
+				 * many hardware frags it would use:
+				 */
+				cur_length = (mb->m_len - moff);
+				cur_frags = (cur_length +
+				    (1 << if_hw_tsomax.frag_size_log2) - 1)
+				    >> if_hw_tsomax.frag_size_log2;
+
+				/* Handle special case: Zero Length Mbuf */
+				if (cur_frags == 0)
+					cur_frags = 1;
+
+				/* Check if fragment limit will be exceeded */
+				if (cur_frags >= rem_frags) {
+					max_len += min(cur_length, rem_frags << if_hw_tsomax.frag_size_log2);
+					break;
+				}
+				max_len += cur_length;
+				rem_frags -= cur_frags;
+				moff = 0;
+				mb = mb->m_next;
+			}
+
 			/*
 			 * Limit a burst to t_tsomax minus IP,
 			 * TCP and options length to keep ip->ip_len
@@ -776,8 +837,8 @@
 			 * from overflowing or exceeding the maximum
 			 * length allowed by the network interface.
 			 */
-			if (len > tp->t_tsomax - hdrlen) {
-				len = tp->t_tsomax - hdrlen;
+			if (len > (u_int)max_len) {
+				len = (u_int)max_len;
 				sendalot = 1;
 			}
 
=== sys/netinet/tcp_subr.c
==================================================================
--- sys/netinet/tcp_subr.c	(revision 270996)
+++ sys/netinet/tcp_subr.c	(local)
@@ -1818,7 +1818,7 @@
 			if (ifp->if_capenable & IFCAP_TSO4 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
-				cap->tsomax = ifp->if_hw_tsomax;
+				cap->tsomax = ifp->if_hw_tsomax.raw_value[0];
 			}
 		}
 		RTFREE(sro.ro_rt);
@@ -1857,7 +1857,7 @@
 			if (ifp->if_capenable & IFCAP_TSO6 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
-				cap->tsomax = ifp->if_hw_tsomax;
+				cap->tsomax = ifp->if_hw_tsomax.raw_value[0];
 			}
 		}
 		RTFREE(sro6.ro_rt);
_______________________________________________
freebsd-current@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-current
To unsubscribe, send any mail to "freebsd-current-unsubscr...@freebsd.org"

Reply via email to