On Tue, May 19, 2020 at 08:48:17AM +1000, Jonathan Matthew wrote:
> So far I've completely ignored offloads in the ethernet drivers I've
> written, but on having a quick look at the documentation I found that
> mcx(4) checksum offload is extremely easy to use, and some simple testing
> suggests that it helps quite a bit.  I've seen tcpbench receive throughput
> increase by around 15%.
> 
> The nic supports all the checksum offloads we know about, reports checksum
> status for every packet without being asked to, and can figure out packet
> header lengths etc. for itself, so on the tx side, the driver just sets
> some flags to say "checksum this for me please", and on the rx side, it
> looks at two bits in the completion queue entry.
> 
> I'm mostly sending this out to see if anyone can gather any interesting
> performance numbers.

IPv4 forwarding, 64 byte UDP packets sent over both mcx interfaces, pfctl -d

with patch:
- generating 560 Kpps: 560 Kpps stable
- generating 1 Mpps: first few min 809 Kpps, then drops to 520 Kpps

without patch:
- generating 560 Kpps: first few min 560 Kpps, then drops to 514 Kpps
- generating 1 Mpps: first few min 766 Kpps, then drops to 500 Kpps

mcx0 at pci7 dev 0 function 0 "Mellanox ConnectX-4 Lx" rev 0x00: FW 14.17.2032, msix, address 24:8a:07:b0:23:a0
mcx1 at pci7 dev 0 function 1 "Mellanox ConnectX-4 Lx" rev 0x00: FW 14.17.2032, msix, address 24:8a:07:b0:23:a1
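
For anyone skimming without the full driver handy, the rx half of this really
is just the two status bits Jonathan describes: the nic validates the L3 and
L4 checksums of every received packet and reports the result in the
completion queue entry, and the driver only has to translate that into the
stack's csum_flags.  A rough sketch of that mapping follows; the constants
and flag names are taken from the diff below, but the wrapper function is
only for illustration, not literal driver code.

/*
 * Sketch: translate mcx completion queue checksum status bits into
 * mbuf checksum flags, as done in mcx_process_rx() in the diff below.
 */
#include <sys/param.h>
#include <sys/mbuf.h>

#define MCX_CQ_ENTRY_FLAGS_L4_OK	(1 << 26)
#define MCX_CQ_ENTRY_FLAGS_L3_OK	(1 << 25)

static void
rx_csum_sketch(uint32_t cq_flags, struct mbuf *m)	/* hypothetical helper */
{
	/* L3_OK: the nic verified the IPv4 header checksum */
	if (cq_flags & MCX_CQ_ENTRY_FLAGS_L3_OK)
		m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK;

	/*
	 * L4_OK: the nic verified the TCP or UDP checksum; the entry
	 * doesn't say which protocol it was, so both flags are set.
	 */
	if (cq_flags & MCX_CQ_ENTRY_FLAGS_L4_OK)
		m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK |
		    M_UDP_CSUM_IN_OK;
}

The tx side is even smaller: mcx_start() just sets MCX_SQE_L3_CSUM |
MCX_SQE_L4_CSUM in the send queue entry and the nic works out the header
offsets by itself.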

> 
> 
> Index: if_mcx.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
> retrieving revision 1.44
> diff -u -p -u -p -r1.44 if_mcx.c
> --- if_mcx.c  24 Apr 2020 07:28:37 -0000      1.44
> +++ if_mcx.c  18 May 2020 10:22:32 -0000
> @@ -1255,6 +1292,10 @@ struct mcx_cq_entry {
>       uint32_t                cq_checksum;
>       uint32_t                __reserved__;
>       uint32_t                cq_flags;
> +#define MCX_CQ_ENTRY_FLAGS_L4_OK             (1 << 26)
> +#define MCX_CQ_ENTRY_FLAGS_L3_OK             (1 << 25)
> +#define MCX_CQ_ENTRY_FLAGS_L2_OK             (1 << 24)
> +
>       uint32_t                cq_lro_srqn;
>       uint32_t                __reserved__[2];
>       uint32_t                cq_byte_cnt;
> @@ -2355,7 +2396,9 @@ mcx_attach(struct device *parent, struct
>       ifp->if_qstart = mcx_start;
>       ifp->if_watchdog = mcx_watchdog;
>       ifp->if_hardmtu = sc->sc_hardmtu;
> -     ifp->if_capabilities = IFCAP_VLAN_MTU;
> +     ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_CSUM_IPv4 |
> +         IFCAP_CSUM_UDPv4 | IFCAP_CSUM_UDPv6 | IFCAP_CSUM_TCPv4 |
> +         IFCAP_CSUM_TCPv6;
>       IFQ_SET_MAXLEN(&ifp->if_snd, 1024);
>  
>       ifmedia_init(&sc->sc_media, IFM_IMASK, mcx_media_change,
> @@ -5662,6 +5966,7 @@ mcx_process_rx(struct mcx_softc *sc, str
>       struct mcx_slot *ms;
>       struct mbuf *m;
>       int slot;
> +     uint32_t flags;
>  
>       slot = betoh16(cqe->cq_wqe_count) % (1 << MCX_LOG_RQ_SIZE);
>  
> @@ -5680,6 +5985,13 @@ mcx_process_rx(struct mcx_softc *sc, str
>                   betoh32(cqe->cq_rx_hash);
>       }
>  
> +     flags = bemtoh32(&cqe->cq_flags);
> +     if (flags & MCX_CQ_ENTRY_FLAGS_L3_OK)
> +             m->m_pkthdr.csum_flags = M_IPV4_CSUM_IN_OK;
> +     if (flags & MCX_CQ_ENTRY_FLAGS_L4_OK)
> +             m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK |
> +                 M_UDP_CSUM_IN_OK;
> +
>       if (c->c_tdiff) {
>               uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
>               t *= c->c_udiff;
> @@ -6343,6 +6657,7 @@ mcx_start(struct ifqueue *ifq)
>               sqe->sqe_signature = htobe32(MCX_SQE_CE_CQE_ALWAYS);
>  
>               /* eth segment */
> +             sqe->sqe_mss_csum = htobe32(MCX_SQE_L3_CSUM | MCX_SQE_L4_CSUM);
>               sqe->sqe_inline_header_size = htobe16(MCX_SQ_INLINE_SIZE);
>               m_copydata(m, 0, MCX_SQ_INLINE_SIZE,
>                   (caddr_t)sqe->sqe_inline_headers);
> 
