From: Scott Mitchell <[email protected]>
Subject: [PATCH] net: add software checksum offload support for af_packet and tap

Add rte_net_ip_udptcp_cksum_mbuf() in rte_net.h, shared between
rte_eth_tap and rte_eth_af_packet, to compute IPv4/UDP/TCP checksums
in software where hardware offload and context propagation are not
supported.

Signed-off-by: Scott Mitchell <[email protected]>
---
Depends-on: patch-160679 ("eal: add __rte_may_alias and __rte_aligned to unaligned typedefs")
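
Note for reviewers: a minimal sketch of how an application would enable
the new TX offloads on an af_packet port (illustrative only, not part of
this patch; port_id, ret, and the usual queue setup are assumed):

    static const struct rte_eth_conf conf = {
        .txmode = {
            .offloads = RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
                        RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
                        RTE_ETH_TX_OFFLOAD_TCP_CKSUM,
        },
    };

    ret = rte_eth_dev_configure(port_id, 1, 1, &conf);

Each TX mbuf must still carry the matching per-packet flags
(RTE_MBUF_F_TX_IPV4, RTE_MBUF_F_TX_IP_CKSUM, RTE_MBUF_F_TX_UDP_CKSUM or
RTE_MBUF_F_TX_TCP_CKSUM) and valid l2_len/l3_len values.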

 doc/guides/nics/features/afpacket.ini     |  2 +
 doc/guides/rel_notes/release_26_03.rst    |  2 +
 drivers/net/af_packet/rte_eth_af_packet.c | 42 ++++++++++----
 drivers/net/tap/rte_eth_tap.c             | 70 ++---------------------
 lib/net/rte_net.c                         | 68 ++++++++++++++++++++++
 lib/net/rte_net.h                         | 22 +++++++
 6 files changed, 130 insertions(+), 76 deletions(-)

diff --git a/doc/guides/nics/features/afpacket.ini b/doc/guides/nics/features/afpacket.ini
index 391f79b173..4bb81c84ff 100644
--- a/doc/guides/nics/features/afpacket.ini
+++ b/doc/guides/nics/features/afpacket.ini
@@ -7,5 +7,7 @@
 Link status          = Y
 Promiscuous mode     = Y
 MTU update           = Y
+L3 checksum offload  = Y
+L4 checksum offload  = Y
 Basic stats          = Y
 Stats per queue      = Y

diff --git a/doc/guides/rel_notes/release_26_03.rst b/doc/guides/rel_notes/release_26_03.rst
index 3b6be19645..2946acce99 100644
--- a/doc/guides/rel_notes/release_26_03.rst
+++ b/doc/guides/rel_notes/release_26_03.rst
@@ -60,6 +60,8 @@ New Features
   * Fixed kernel memory barrier protocol for memory availability
   * Fixed shared memory frame overhead offset calculation
   * Added ``txpollnotrdy`` devarg to avoid ``poll()`` blocking calls
+  * Added checksum offload support for ``IPV4_CKSUM``, ``UDP_CKSUM``,
+    and ``TCP_CKSUM``
 
 Removed Items
 -------------

diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 9df1b1fd4c..662341ffc7 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -10,6 +10,8 @@
 #include <rte_string_fns.h>
 #include <rte_mbuf.h>
 #include <rte_atomic.h>
+#include <rte_ip.h>
+#include <rte_net.h>
 #include <rte_bitops.h>
 #include <ethdev_driver.h>
 #include <ethdev_vdev.h>
@@ -101,6 +103,7 @@ struct pmd_internals {
 	struct pkt_tx_queue *tx_queue;
 	uint8_t vlan_strip;
 	uint8_t timestamp_offloading;
+	bool tx_sw_cksum;
 };
 
 static const char *valid_arguments[] = {
@@ -220,7 +223,7 @@ eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		/* account for the receive frame */
 		bufs[i] = mbuf;
 		num_rx++;
-		num_rx_bytes += mbuf->pkt_len;
+		num_rx_bytes += rte_pktmbuf_pkt_len(mbuf);
 	}
 	pkt_q->framenum = framenum;
 	pkt_q->rx_pkts += num_rx;
@@ -256,6 +259,7 @@ eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
 	struct tpacket2_hdr *ppd;
 	struct rte_mbuf *mbuf;
+	struct rte_mbuf *seg;
 	uint8_t *pbuf;
 	unsigned int framecount, framenum;
 	struct pollfd pfd;
@@ -277,7 +281,7 @@ eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		mbuf = bufs[i];
 
 		/* Drop oversized packets. Insert VLAN if necessary */
-		if (unlikely(mbuf->pkt_len > pkt_q->frame_data_size ||
+		if (unlikely(rte_pktmbuf_pkt_len(mbuf) > pkt_q->frame_data_size ||
 			     ((mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) != 0 &&
 			      rte_vlan_insert(&mbuf) != 0))) {
 			continue;
@@ -308,23 +312,33 @@ eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
 		pbuf = (uint8_t *)ppd + ETH_AF_PACKET_FRAME_OVERHEAD;
 
-		ppd->tp_len = mbuf->pkt_len;
-		ppd->tp_snaplen = mbuf->pkt_len;
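+		/* compute requested checksums in software, dropping the packet on failure */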
+		if (pkt_q->sw_cksum) {
+			seg = rte_net_ip_udptcp_cksum_mbuf(mbuf);
+			if (!seg)
+				continue;
 
-		struct rte_mbuf *tmp_mbuf = mbuf;
+			mbuf = seg;
+			bufs[i] = seg;
+		}
+
+		ppd->tp_len = rte_pktmbuf_pkt_len(mbuf);
+		ppd->tp_snaplen = rte_pktmbuf_pkt_len(mbuf);
+
+		seg = mbuf;
 		do {
-			uint16_t data_len = rte_pktmbuf_data_len(tmp_mbuf);
-			memcpy(pbuf, rte_pktmbuf_mtod(tmp_mbuf, void*), data_len);
+			uint16_t data_len = rte_pktmbuf_data_len(seg);
+			memcpy(pbuf, rte_pktmbuf_mtod(seg, void*), data_len);
 			pbuf += data_len;
-			tmp_mbuf = tmp_mbuf->next;
-		} while (tmp_mbuf);
+			seg = seg->next;
+		} while (seg);
 
 		/* release incoming frame and advance ring buffer */
 		tpacket_write_status(&ppd->tp_status, TP_STATUS_SEND_REQUEST);
 		if (++framenum >= framecount)
 			framenum = 0;
 		num_tx++;
-		num_tx_bytes += mbuf->pkt_len;
+		num_tx_bytes += rte_pktmbuf_pkt_len(mbuf);
 	}
 
 	rte_pktmbuf_free_bulk(&bufs[0], i);
@@ -396,10 +409,13 @@ eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
 {
 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
 	const struct rte_eth_rxmode *rxmode = &dev_conf->rxmode;
+	const struct rte_eth_txmode *txmode = &dev_conf->txmode;
 	struct pmd_internals *internals = dev->data->dev_private;
 
 	internals->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
 	internals->timestamp_offloading = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP);
+	internals->tx_sw_cksum = !!(txmode->offloads & (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
+			RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_TCP_CKSUM));
 
 	return 0;
 }
@@ -417,7 +433,10 @@ eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	dev_info->max_tx_queues = (uint16_t)internals->nb_queues;
 	dev_info->min_rx_bufsize = ETH_AF_PACKET_ETH_OVERHEAD;
 	dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
-			RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
+			RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
+			RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
+			RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
+			RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
 	dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP |
 			RTE_ETH_RX_OFFLOAD_TIMESTAMP;
 
@@ -618,6 +637,7 @@ eth_tx_queue_setup(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *internals = dev->data->dev_private;
 
+	internals->tx_queue[tx_queue_id].sw_cksum = internals->tx_sw_cksum;
 	dev->data->tx_queues[tx_queue_id] = &internals->tx_queue[tx_queue_id];
 
 	return 0;

diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 730f1859bd..c7ed6dfb8b 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -525,7 +525,6 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
 		struct iovec iovecs[mbuf->nb_segs + 2];
 		struct tun_pi pi = { .flags = 0, .proto = 0x00 };
 		struct rte_mbuf *seg = mbuf;
-		uint64_t l4_ol_flags;
 		int proto;
 		int n;
 		int j;
@@ -556,74 +555,16 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
 		iovecs[k].iov_len = sizeof(pi);
 		k++;
 
-		l4_ol_flags = mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK;
-		if (txq->csum && (mbuf->ol_flags & RTE_MBUF_F_TX_IP_CKSUM ||
-		    l4_ol_flags == RTE_MBUF_F_TX_UDP_CKSUM ||
-		    l4_ol_flags == RTE_MBUF_F_TX_TCP_CKSUM)) {
-			unsigned int hdrlens = mbuf->l2_len + mbuf->l3_len;
-			uint16_t *l4_cksum;
-			void *l3_hdr;
-
-			if (l4_ol_flags == RTE_MBUF_F_TX_UDP_CKSUM)
-				hdrlens += sizeof(struct rte_udp_hdr);
-			else if (l4_ol_flags == RTE_MBUF_F_TX_TCP_CKSUM)
-				hdrlens += sizeof(struct rte_tcp_hdr);
-			else if (l4_ol_flags != RTE_MBUF_F_TX_L4_NO_CKSUM)
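+		/* delegate software checksum computation to the shared rte_net helper */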
+		if (txq->csum) {
+			seg = rte_net_ip_udptcp_cksum_mbuf(mbuf);
+			if (!seg)
 				return -1;
 
-			/* Support only packets with at least layer 4
-			 * header included in the first segment
-			 */
-			if (rte_pktmbuf_data_len(mbuf) < hdrlens)
-				return -1;
-
-			/* To change checksums (considering that a mbuf can be
-			 * indirect, for example), copy l2, l3 and l4 headers
-			 * in a new segment and chain it to existing data
-			 */
-			seg = rte_pktmbuf_copy(mbuf, mbuf->pool, 0, hdrlens);
-			if (seg == NULL)
-				return -1;
-			rte_pktmbuf_adj(mbuf, hdrlens);
-			rte_pktmbuf_chain(seg, mbuf);
-			pmbufs[i] = mbuf = seg;
-
-			l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, mbuf->l2_len);
-			if (mbuf->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) {
-				struct rte_ipv4_hdr *iph = l3_hdr;
-
-				iph->hdr_checksum = 0;
-				iph->hdr_checksum = rte_ipv4_cksum(iph);
-			}
-
-			if (l4_ol_flags == RTE_MBUF_F_TX_L4_NO_CKSUM)
-				goto skip_l4_cksum;
-
-			if (l4_ol_flags == RTE_MBUF_F_TX_UDP_CKSUM) {
-				struct rte_udp_hdr *udp_hdr;
-
-				udp_hdr = rte_pktmbuf_mtod_offset(mbuf, struct rte_udp_hdr *,
-						mbuf->l2_len + mbuf->l3_len);
-				l4_cksum = &udp_hdr->dgram_cksum;
-			} else {
-				struct rte_tcp_hdr *tcp_hdr;
-
-				tcp_hdr = rte_pktmbuf_mtod_offset(mbuf, struct rte_tcp_hdr *,
-						mbuf->l2_len + mbuf->l3_len);
-				l4_cksum = &tcp_hdr->cksum;
-			}
-
-			*l4_cksum = 0;
-			if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
-				*l4_cksum = rte_ipv4_udptcp_cksum_mbuf(mbuf, l3_hdr,
-						mbuf->l2_len + mbuf->l3_len);
-			} else {
-				*l4_cksum = rte_ipv6_udptcp_cksum_mbuf(mbuf, l3_hdr,
-						mbuf->l2_len + mbuf->l3_len);
-			}
+
+			mbuf = seg;
+			pmbufs[i] = seg;
 		}
 
-skip_l4_cksum:
 		for (j = 0; j < mbuf->nb_segs; j++) {
 			iovecs[k].iov_len = rte_pktmbuf_data_len(seg);
 			iovecs[k].iov_base = rte_pktmbuf_mtod(seg, void *);

diff --git a/lib/net/rte_net.c b/lib/net/rte_net.c
index 458b4814a9..1a0397bcd7 100644
--- a/lib/net/rte_net.c
+++ b/lib/net/rte_net.c
@@ -615,3 +615,71 @@ uint32_t rte_net_get_ptype(const struct rte_mbuf *m,
 
 	return pkt_type;
 }
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_net_ip_udptcp_cksum_mbuf, 26.03)
+struct rte_mbuf *
+rte_net_ip_udptcp_cksum_mbuf(struct rte_mbuf *mbuf)
+{
+	const uint64_t l4_ol_flags = mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK;
+	const uint32_t l4_offset = mbuf->l2_len + mbuf->l3_len;
+	uint32_t hdrlens = l4_offset;
+	unaligned_uint16_t *l4_cksum = NULL;
+	void *l3_hdr;
+
+	/* Quick check - nothing to do if no checksum offloads requested */
+	if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK)))
+		return mbuf;
+
+	/* Determine total header length needed */
+	if (l4_ol_flags == RTE_MBUF_F_TX_UDP_CKSUM)
+		hdrlens += sizeof(struct rte_udp_hdr);
+	else if (l4_ol_flags == RTE_MBUF_F_TX_TCP_CKSUM)
+		hdrlens += sizeof(struct rte_tcp_hdr);
+	else if (l4_ol_flags != RTE_MBUF_F_TX_L4_NO_CKSUM)
+		return NULL; /* Unsupported L4 checksum type */
+
+	/* Validate we at least have L2+L3 headers */
+	if (unlikely(rte_pktmbuf_data_len(mbuf) < l4_offset))
+		return NULL;
+
+	if (!RTE_MBUF_DIRECT(mbuf) || rte_mbuf_refcnt_read(mbuf) > 1) {
+		/* Indirect or shared - must copy, cannot modify in-place */
+		struct rte_mbuf *seg = rte_pktmbuf_copy(mbuf, mbuf->pool, 0, hdrlens);
+		if (!seg)
+			return NULL;
+
+		rte_pktmbuf_adj(mbuf, hdrlens);
+		rte_pktmbuf_chain(seg, mbuf);
+		mbuf = seg;
+	} else if (rte_pktmbuf_data_len(mbuf) < hdrlens &&
+		   (rte_pktmbuf_linearize(mbuf) < 0 || rte_pktmbuf_data_len(mbuf) < hdrlens)) {
+		/* direct and non-shared, but headers span segments and could not be linearized */
+		return NULL;
+	}
+	/* else: direct, non-shared, contiguous - can modify in-place, nothing to do */
+
+	l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, mbuf->l2_len);
+
+	/* IPv4 header checksum */
+	if (mbuf->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) {
+		struct rte_ipv4_hdr *iph = (struct rte_ipv4_hdr *)l3_hdr;
+		iph->hdr_checksum = 0;
+		iph->hdr_checksum = rte_ipv4_cksum(iph);
+	}
+
+	/* L4 checksum */
+	if (l4_ol_flags == RTE_MBUF_F_TX_UDP_CKSUM)
+		l4_cksum = &rte_pktmbuf_mtod_offset(mbuf, struct rte_udp_hdr *,
+				l4_offset)->dgram_cksum;
+	else if (l4_ol_flags == RTE_MBUF_F_TX_TCP_CKSUM)
+		l4_cksum = &rte_pktmbuf_mtod_offset(mbuf, struct rte_tcp_hdr *, l4_offset)->cksum;
+
+	if (l4_cksum) {
+		*l4_cksum = 0;
+		*l4_cksum = (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) ?
+			rte_ipv4_udptcp_cksum_mbuf(mbuf, l3_hdr, l4_offset) :
+			rte_ipv6_udptcp_cksum_mbuf(mbuf, l3_hdr, l4_offset);
+	}
+
+	return mbuf;
+}

diff --git a/lib/net/rte_net.h b/lib/net/rte_net.h
index 65d724b84b..b258a86928 100644
--- a/lib/net/rte_net.h
+++ b/lib/net/rte_net.h
@@ -246,6 +246,41 @@ rte_net_intel_cksum_prepare(struct rte_mbuf *m)
 	return rte_net_intel_cksum_flags_prepare(m, m->ol_flags);
 }
 
+/**
+ * Compute IP and L4 checksums in software for mbufs with
+ * RTE_MBUF_F_TX_IP_CKSUM, RTE_MBUF_F_TX_UDP_CKSUM, or
+ * RTE_MBUF_F_TX_TCP_CKSUM offload flags set.
+ *
+ * On success, this function takes ownership of the input mbuf. The mbuf may be
+ * modified in-place (direct, non-shared mbufs) or prepended with a new header
+ * segment (indirect or shared mbufs), with the original becoming part of the
+ * returned chain.
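+ *
+ * A minimal usage sketch for an IPv4/UDP packet (illustrative only;
+ * ret is a struct rte_mbuf pointer):
+ *
+ * @code
+ * mbuf->l2_len = sizeof(struct rte_ether_hdr);
+ * mbuf->l3_len = sizeof(struct rte_ipv4_hdr);
+ * mbuf->ol_flags |= RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
+ *	RTE_MBUF_F_TX_UDP_CKSUM;
+ * ret = rte_net_ip_udptcp_cksum_mbuf(mbuf);
+ * if (ret == NULL)
+ *	rte_pktmbuf_free(mbuf);
+ * @endcode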
+ *
+ * @param mbuf
+ *   The packet mbuf to checksum.
+ * @return
+ *   - On success: a pointer to the mbuf with checksums computed. This may be
+ *     the input mbuf or the head of a new chain that contains it; free only
+ *     the returned pointer and never the input mbuf separately.
+ *   - On error: NULL. The original mbuf remains valid and owned by the caller.
+ */
+__rte_experimental
+struct rte_mbuf *
+rte_net_ip_udptcp_cksum_mbuf(struct rte_mbuf *mbuf);
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.39.5 (Apple Git-154)

