On Tue, May 19, 2020 at 08:48:17AM +1000, Jonathan Matthew wrote:
> So far I've completely ignored offloads in the ethernet drivers I've
> written, but on having a quick look at the documentation I found that
> mcx(4) checksum offload is extremely easy to use, and some simple testing
> suggests that it helps quite a bit. I've seen tcpbench receive throughput
> increase by around 15%.
>
> The nic supports all the checksum offloads we know about, reports checksum
> status for every packet without being asked to, and can figure out packet
> header lengths etc. for itself, so on the tx side, the driver just sets
> some flags to say "checksum this for me please", and on the rx side, it
> looks at two bits in the completion queue entry.
>
> I'm mostly sending this out to see if anyone can gather any interesting
> performance numbers.
IPv4 forwarding, 64-byte UDP packets sent over both mcx interfaces, pfctl -d
with patch:
- generating 560 Kpps: 560 Kpps stable
- generating 1 Mpps: first few min 809 Kpps, then drops to 520 Kpps
without patch:
- generating 560 Kpps: first few min 560 Kpps, then drops to 514 Kpps
- generating 1 Mpps: first few min 766 Kpps, then drops to 500 Kpps
mcx0 at pci7 dev 0 function 0 "Mellanox ConnectX-4 Lx" rev 0x00: FW 14.17.2032,
msix, address 24:8a:07:b0:23:a0
mcx1 at pci7 dev 0 function 1 "Mellanox ConnectX-4 Lx" rev 0x00: FW 14.17.2032,
msix, address 24:8a:07:b0:23:a1
>
>
> Index: if_mcx.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
> retrieving revision 1.44
> diff -u -p -u -p -r1.44 if_mcx.c
> --- if_mcx.c 24 Apr 2020 07:28:37 -0000 1.44
> +++ if_mcx.c 18 May 2020 10:22:32 -0000
> @@ -1255,6 +1292,10 @@ struct mcx_cq_entry {
> uint32_t cq_checksum;
> uint32_t __reserved__;
> uint32_t cq_flags;
> +#define MCX_CQ_ENTRY_FLAGS_L4_OK (1 << 26)
> +#define MCX_CQ_ENTRY_FLAGS_L3_OK (1 << 25)
> +#define MCX_CQ_ENTRY_FLAGS_L2_OK (1 << 24)
> +
> uint32_t cq_lro_srqn;
> uint32_t __reserved__[2];
> uint32_t cq_byte_cnt;
> @@ -2355,7 +2396,9 @@ mcx_attach(struct device *parent, struct
> ifp->if_qstart = mcx_start;
> ifp->if_watchdog = mcx_watchdog;
> ifp->if_hardmtu = sc->sc_hardmtu;
> - ifp->if_capabilities = IFCAP_VLAN_MTU;
> + ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 |
> + IFCAP_CSUM_UDPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv4 |
> + IFCAP_CSUM_TCPv6;
> IFQ_SET_MAXLEN(&ifp->if_snd, 1024);
>
> ifmedia_init(&sc->sc_media, IFM_IMASK, mcx_media_change,
> @@ -5662,6 +5966,7 @@ mcx_process_rx(struct mcx_softc *sc, str
> struct mcx_slot *ms;
> struct mbuf *m;
> int slot;
> + uint32_t flags;
>
> slot = betoh16(cqe->cq_wqe_count) % (1 << MCX_LOG_RQ_SIZE);
>
> @@ -5680,6 +5985,13 @@ mcx_process_rx(struct mcx_softc *sc, str
> betoh32(cqe->cq_rx_hash);
> }
>
> + flags = bemtoh32(&cqe->cq_flags);
> + if (flags & MCX_CQ_ENTRY_FLAGS_L3_OK)
> + m->m_pkthdr.csum_flags = M_IPV4_CSUM_IN_OK;
> + if (flags & MCX_CQ_ENTRY_FLAGS_L4_OK)
> + m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK |
> + M_UDP_CSUM_IN_OK;
> +
> if (c->c_tdiff) {
> uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
> t *= c->c_udiff;
> @@ -6343,6 +6657,7 @@ mcx_start(struct ifqueue *ifq)
> sqe->sqe_signature = htobe32(MCX_SQE_CE_CQE_ALWAYS);
>
> /* eth segment */
> + sqe->sqe_mss_csum = htobe32(MCX_SQE_L3_CSUM | MCX_SQE_L4_CSUM);
> sqe->sqe_inline_header_size = htobe16(MCX_SQ_INLINE_SIZE);
> m_copydata(m, 0, MCX_SQ_INLINE_SIZE,
> (caddr_t)sqe->sqe_inline_headers);
>