So far I've completely ignored offloads in the Ethernet drivers I've
written, but after a quick look at the documentation I found that
mcx(4) checksum offload is extremely easy to use, and some simple testing
suggests that it helps quite a bit: I've seen tcpbench receive throughput
increase by around 15%.
The NIC supports all the checksum offloads we know about, reports checksum
status for every packet without being asked to, and can figure out packet
header lengths etc. for itself. So on the tx side, the driver just sets
some flags to say "checksum this for me please", and on the rx side, it
looks at two bits in the completion queue entry.
I'm mostly sending this out to see if anyone can gather any interesting
performance numbers.
Index: if_mcx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
retrieving revision 1.44
diff -u -p -u -p -r1.44 if_mcx.c
--- if_mcx.c 24 Apr 2020 07:28:37 -0000 1.44
+++ if_mcx.c 18 May 2020 10:22:32 -0000
@@ -1255,6 +1292,10 @@ struct mcx_cq_entry {
uint32_t cq_checksum;
uint32_t __reserved__;
uint32_t cq_flags;
+#define MCX_CQ_ENTRY_FLAGS_L4_OK (1 << 26)
+#define MCX_CQ_ENTRY_FLAGS_L3_OK (1 << 25)
+#define MCX_CQ_ENTRY_FLAGS_L2_OK (1 << 24)
+
uint32_t cq_lro_srqn;
uint32_t __reserved__[2];
uint32_t cq_byte_cnt;
@@ -2355,7 +2396,9 @@ mcx_attach(struct device *parent, struct
ifp->if_qstart = mcx_start;
ifp->if_watchdog = mcx_watchdog;
ifp->if_hardmtu = sc->sc_hardmtu;
- ifp->if_capabilities = IFCAP_VLAN_MTU;
+ ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 |
+ IFCAP_CSUM_UDPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv4 |
+ IFCAP_CSUM_TCPv6;
IFQ_SET_MAXLEN(&ifp->if_snd, 1024);
ifmedia_init(&sc->sc_media, IFM_IMASK, mcx_media_change,
@@ -5662,6 +5966,7 @@ mcx_process_rx(struct mcx_softc *sc, str
struct mcx_slot *ms;
struct mbuf *m;
int slot;
+ uint32_t flags;
slot = betoh16(cqe->cq_wqe_count) % (1 << MCX_LOG_RQ_SIZE);
@@ -5680,6 +5985,13 @@ mcx_process_rx(struct mcx_softc *sc, str
betoh32(cqe->cq_rx_hash);
}
+ flags = bemtoh32(&cqe->cq_flags);
+ if (flags & MCX_CQ_ENTRY_FLAGS_L3_OK)
+ m->m_pkthdr.csum_flags = M_IPV4_CSUM_IN_OK;
+ if (flags & MCX_CQ_ENTRY_FLAGS_L4_OK)
+ m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK |
+ M_UDP_CSUM_IN_OK;
+
if (c->c_tdiff) {
uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
t *= c->c_udiff;
@@ -6343,6 +6657,7 @@ mcx_start(struct ifqueue *ifq)
sqe->sqe_signature = htobe32(MCX_SQE_CE_CQE_ALWAYS);
/* eth segment */
+ sqe->sqe_mss_csum = htobe32(MCX_SQE_L3_CSUM | MCX_SQE_L4_CSUM);
sqe->sqe_inline_header_size = htobe16(MCX_SQ_INLINE_SIZE);
m_copydata(m, 0, MCX_SQ_INLINE_SIZE,
(caddr_t)sqe->sqe_inline_headers);