If we can predict that the segmented traffic will have the same headers,
it is possible to rely on HW segmentation.

TCP over IPv4 Geneve (with checksum on tunnel) on a mlx5 NIC:
Before:  7.80 Gbits/sec, 100% CPU (SW segmentation)
After:  10.8  Gbits/sec,  27% CPU (HW segmentation)

This optimisation requires that the original tso_segsz is left
untouched, so revert commit
844a7cfa6edd ("netdev-dpdk: Use guest TSO segmentation size hint.").

There may still be some non-optimal cases where the L4 payload size is
not a multiple of tso_segsz. For this condition, "partially" segment:
split off the last segment in software and keep the first n-1 segments'
data in the original packet, which will then be segmented by HW.
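
As a rough illustration of the arithmetic (a standalone sketch, not part
of this patch; the payload and segment sizes below are made up), a 7300
byte TCP payload with a tso_segsz of 1448 only has its 60 byte tail
peeled off in software, while the 5 remaining full-size segments stay in
the original packet for HW segmentation:

    /* Standalone sketch of the partial segmentation arithmetic; values
     * are hypothetical and this is not OVS code. */
    #include <stdio.h>

    int
    main(void)
    {
        unsigned int payload_len = 7300; /* L4 payload of the TSO frame. */
        unsigned int tso_segsz = 1448;   /* Guest-provided segment size. */

        /* Same rounding as OVS DIV_ROUND_UP(payload_len, tso_segsz). */
        unsigned int n_segs = (payload_len + tso_segsz - 1) / tso_segsz;

        if (payload_len % tso_segsz) {
            /* Payload not aligned on tso_segsz: split off the last
             * (short) segment in software and keep the first n-1 full
             * segments in the original packet for HW segmentation. */
            unsigned int tail = payload_len - (n_segs - 1) * tso_segsz;

            printf("SW segments the %u byte tail, HW segments %u bytes "
                   "into %u segments of %u bytes\n",
                   tail, (n_segs - 1) * tso_segsz, n_segs - 1, tso_segsz);
        } else {
            printf("Payload aligned on tso_segsz, full HW segmentation\n");
        }
        return 0;
    }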

Reported-at: https://issues.redhat.com/browse/FDP-1897
Signed-off-by: David Marchand <[email protected]>
---
 lib/dp-packet-gso.c | 72 ++++++++++++++++++++++++++++++++++++++-------
 lib/dp-packet-gso.h |  2 ++
 lib/netdev-dpdk.c   |  7 -----
 lib/netdev.c        | 34 ++++++++++++++-------
 lib/packets.c       | 34 +++++++++++++++++++++
 5 files changed, 121 insertions(+), 28 deletions(-)

diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
index 6b8da8b370..d8d3a87500 100644
--- a/lib/dp-packet-gso.c
+++ b/lib/dp-packet-gso.c
@@ -77,6 +77,20 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
     return DIV_ROUND_UP(data_length, segsz);
 }
 
+int
+dp_packet_gso_partial_nr_segs(struct dp_packet *p)
+{
+    if ((dp_packet_tunnel_geneve(p)
+         || dp_packet_tunnel_vxlan(p))
+        && dp_packet_l4_checksum_partial(p)
+        && dp_packet_get_inner_tcp_payload_length(p)
+           != dp_packet_gso_nr_segs(p) * dp_packet_get_tso_segsz(p)) {
+        return 2;
+    }
+
+    return 1;
+}
+
 static void
 dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
                              uint16_t tso_segsz)
@@ -139,7 +153,7 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
     tcp_seq += seg_no * tso_segsz;
     put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
 
-    if (OVS_LIKELY(seg_no < (n_segs - 1))) {
+    if (seg_no < (n_segs - 1) && !dp_packet_get_tso_segsz(seg)) {
         uint16_t tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
         /* Reset flags PUSH and FIN unless it is the last segment. */
         uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
@@ -160,12 +174,9 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
     }
 }
 
-/* Perform software segmentation on packet 'p'.
- *
- * Segments packet 'p' into the array of preallocated batches in 'batches',
- * updating the 'batches' pointer as needed. */
-void
-dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
+static void
+dp_packet_gso__(struct dp_packet *p, struct dp_packet_batch **batches,
+                bool partial_seg)
 {
     struct dp_packet_batch *curr_batch = *batches;
     struct dp_packet *seg;
@@ -199,6 +210,13 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
         data_len = dp_packet_get_tcp_payload_length(p);
     }
 
+    if (partial_seg) {
+        if (dp_packet_gso_partial_nr_segs(p) != 1) {
+            goto last_seg;
+        }
+        goto first_seg;
+    }
+
     for (int i = 1; i < n_segs - 1; i++) {
         seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + i * tso_segsz,
                                     tso_segsz);
@@ -210,6 +228,7 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
         dp_packet_batch_add(curr_batch, seg);
     }
 
+last_seg:
     /* Create the last segment. */
     seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + (n_segs - 1) * tso_segsz,
                                 data_len - (n_segs - 1) * tso_segsz);
@@ -220,11 +239,44 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
     }
     dp_packet_batch_add(curr_batch, seg);
 
-    /* Trim the first segment and reset TSO. */
-    dp_packet_set_size(p, hdr_len + tso_segsz);
-    dp_packet_set_tso_segsz(p, 0);
+first_seg:
+    if (partial_seg) {
+        if (dp_packet_gso_partial_nr_segs(p) != 1) {
+            dp_packet_set_size(p, hdr_len + (n_segs - 1) * tso_segsz);
+            if (n_segs == 2) {
+                /* No need for HW segmentation, we already did the job. */
+                dp_packet_set_tso_segsz(p, 0);
+            }
+        }
+    } else {
+        /* Trim the first segment and reset TSO. */
+        dp_packet_set_size(p, hdr_len + tso_segsz);
+        dp_packet_set_tso_segsz(p, 0);
+    }
     dp_packet_gso_update_segment(p, 0, n_segs, tso_segsz);
 
 out:
     *batches = curr_batch;
 }
+
+/* Perform software segmentation on packet 'p'.
+ *
+ * Segments packet 'p' into the array of preallocated batches in 'batches',
+ * updating the 'batches' pointer as needed. */
+void
+dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
+{
+    dp_packet_gso__(p, batches, false);
+}
+
+/* Perform partial software segmentation on packet 'p'.
+ *
+ * For UDP tunnels, if the packet payload length is not aligned on the
+ * segmentation size, segments the last segment of packet 'p' into the array
+ * of preallocated batches in 'batches', updating the 'batches' pointer
+ * as needed. */
+void
+dp_packet_gso_partial(struct dp_packet *p, struct dp_packet_batch **batches)
+{
+    dp_packet_gso__(p, batches, true);
+}
diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
index 2abc19ea31..ed55a11d79 100644
--- a/lib/dp-packet-gso.h
+++ b/lib/dp-packet-gso.h
@@ -19,5 +19,7 @@
 
 void dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
 int dp_packet_gso_nr_segs(struct dp_packet *);
+void dp_packet_gso_partial(struct dp_packet *, struct dp_packet_batch **);
+int dp_packet_gso_partial_nr_segs(struct dp_packet *);
 
 #endif /* dp-packet-gso.h */
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 38cf0ebb2e..55278c2450 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2725,20 +2725,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
 
     if (mbuf->tso_segsz) {
         struct tcp_header *th = l4;
-        uint16_t link_tso_segsz;
         int hdr_len;
 
         mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
 
         hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
-        link_tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
         if (dp_packet_tunnel(pkt)) {
             hdr_len += mbuf->outer_l2_len + mbuf->outer_l3_len;
-            link_tso_segsz -= mbuf->outer_l3_len + mbuf->l2_len;
-        }
-
-        if (mbuf->tso_segsz > link_tso_segsz) {
-            mbuf->tso_segsz = link_tso_segsz;
         }
 
         if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
diff --git a/lib/netdev.c b/lib/netdev.c
index f493dab005..5413ca61d0 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -71,6 +71,7 @@ COVERAGE_DEFINE(netdev_add_router);
 COVERAGE_DEFINE(netdev_get_stats);
 COVERAGE_DEFINE(netdev_push_header_drops);
 COVERAGE_DEFINE(netdev_soft_seg_good);
+COVERAGE_DEFINE(netdev_partial_seg_good);
 
 struct netdev_saved_flags {
     struct netdev *netdev;
@@ -802,7 +803,8 @@ netdev_get_pt_mode(const struct netdev *netdev)
  * from netdev_class->send() if at least one batch failed to send. */
 static int
 netdev_send_tso(struct netdev *netdev, int qid,
-                struct dp_packet_batch *batch, bool concurrent_txq)
+                struct dp_packet_batch *batch, bool concurrent_txq,
+                bool partial_seg)
 {
     struct dp_packet_batch *batches;
     struct dp_packet *packet;
@@ -812,11 +814,15 @@ netdev_send_tso(struct netdev *netdev, int qid,
     int error;
 
     /* Calculate the total number of packets in the batch after
-     * the segmentation. */
+     * the (partial?) segmentation. */
     n_packets = 0;
     DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
         if (dp_packet_get_tso_segsz(packet)) {
-            n_packets += dp_packet_gso_nr_segs(packet);
+            if (partial_seg) {
+                n_packets += dp_packet_gso_partial_nr_segs(packet);
+            } else {
+                n_packets += dp_packet_gso_nr_segs(packet);
+            }
         } else {
             n_packets++;
         }
@@ -842,8 +848,13 @@ netdev_send_tso(struct netdev *netdev, int qid,
     curr_batch = batches;
     DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
         if (dp_packet_get_tso_segsz(packet)) {
-            dp_packet_gso(packet, &curr_batch);
-            COVERAGE_INC(netdev_soft_seg_good);
+            if (partial_seg) {
+                dp_packet_gso_partial(packet, &curr_batch);
+                COVERAGE_INC(netdev_partial_seg_good);
+            } else {
+                dp_packet_gso(packet, &curr_batch);
+                COVERAGE_INC(netdev_soft_seg_good);
+            }
         } else {
             if (dp_packet_batch_is_full(curr_batch)) {
                 curr_batch++;
@@ -906,7 +917,8 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
         if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
             DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
                 if (dp_packet_get_tso_segsz(packet)) {
-                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq,
+                                           false);
                 }
             }
         } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
@@ -915,16 +927,16 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
             DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
                 if (dp_packet_get_tso_segsz(packet)
                     && dp_packet_tunnel(packet)) {
-                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq,
+                                           false);
                 }
             }
         } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
             DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
                 if (dp_packet_get_tso_segsz(packet)
-                    && (dp_packet_tunnel_vxlan(packet)
-                        || dp_packet_tunnel_geneve(packet))
-                    && dp_packet_l4_checksum_partial(packet)) {
-                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+                    && dp_packet_gso_partial_nr_segs(packet) != 1) {
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq,
+                                           true);
                 }
             }
         }
diff --git a/lib/packets.c b/lib/packets.c
index c05b4abcc8..4e5cac66e4 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -33,6 +33,7 @@
 #include "ovs-thread.h"
 #include "odp-util.h"
 #include "dp-packet.h"
+#include "dp-packet-gso.h"
 #include "unaligned.h"
 
 const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
@@ -2103,6 +2104,7 @@ packet_udp_tunnel_csum(struct dp_packet *p)
     uint32_t partial_csum;
     struct ip_header *ip;
     uint32_t inner_csum;
+    uint16_t tso_segsz;
     void *inner_l4;
 
     inner_ip = dp_packet_inner_l3(p);
@@ -2180,6 +2182,38 @@ packet_udp_tunnel_csum(struct dp_packet *p)
     partial_csum = csum_continue(partial_csum, inner_l4_csum_p + 1,
         (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1));
     udp->udp_csum = csum_finish(partial_csum);
+    tso_segsz = dp_packet_get_tso_segsz(p);
+    if (tso_segsz) {
+        uint16_t payload_len = dp_packet_get_inner_tcp_payload_length(p);
+
+        ovs_assert(payload_len == tso_segsz * dp_packet_gso_nr_segs(p));
+
+        /* The pseudo header used in the outer UDP checksum is dependent on
+         * the ip_tot_len / ip6_plen which was a reflection of the TSO frame
+         * size. The segmented packet will be shorter. */
+        udp->udp_csum = recalc_csum16(udp->udp_csum, htons(payload_len),
+                                      htons(tso_segsz));
+
+        /* When segmenting the packet, various headers get updated:
+         * - inner L3
+         *   - for IPv4, ip_tot_len is updated, BUT it is not affecting the
+         *     outer UDP checksum because the IPv4 header itself contains
+         *     a checksum that compensates for this update,
+         *   - for IPv6, ip6_plen is updated, and this must be considered,
+         * - inner L4
+         *   - inner pseudo header used in the TCP checksum is dependent on
+         *     the inner ip_tot_len / ip6_plen,
+         *   - TCP seq number is updated,
+         *   - the HW may change some TCP flags (think PSH/FIN),
+         *   BUT the TCP checksum will compensate for those updates,
+         *
+         * Summary: we only need to care about the inner IPv6 header update.
+         */
+        if (IP_VER(inner_ip->ip_ihl_ver) != 4) {
+            udp->udp_csum = recalc_csum16(udp->udp_csum, htons(payload_len),
+                                          htons(tso_segsz));
+        }
+    }
     if (!udp->udp_csum) {
         udp->udp_csum = htons(0xffff);
     }
-- 
2.51.0
