If we can predict that the segmented traffic will have the same headers,
it is possible to rely on HW segmentation.

TCP over IPv4 Geneve (with checksum on tunnel) on a mlx5 NIC:
Before:  7.80 Gbits/sec, 100% CPU (SW segmentation)
After:  10.8  Gbits/sec,  27% CPU (HW segmentation)

This optimisation requires that the original tso_segsz is left
untouched, so revert commit
844a7cfa6edd ("netdev-dpdk: Use guest TSO segmentation size hint.").

There may still be some non-optimal cases where the L4 payload size is
not a multiple of tso_segsz. For this condition, "partially" segment:
split off the last segment in software and keep the first n-1 segments'
data in the original packet, which will then be segmented by HW.
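
As a rough illustration of the arithmetic (a standalone sketch, not part
of this patch; the payload and segment sizes below are made up), a 7300
byte TCP payload with a tso_segsz of 1448 only has its 60 byte tail
peeled off in software, while the 5 remaining full-size segments stay in
the original packet for HW segmentation:

    /* Standalone sketch of the partial segmentation arithmetic; values
     * are hypothetical and this is not OVS code. */
    #include <stdio.h>

    int
    main(void)
    {
        unsigned int payload_len = 7300; /* L4 payload of the TSO frame. */
        unsigned int tso_segsz = 1448;   /* Guest-provided segment size. */

        /* Same rounding as OVS DIV_ROUND_UP(payload_len, tso_segsz). */
        unsigned int n_segs = (payload_len + tso_segsz - 1) / tso_segsz;

        if (payload_len % tso_segsz) {
            /* Payload not aligned on tso_segsz: split off the last
             * (short) segment in software and keep the first n-1 full
             * segments in the original packet for HW segmentation. */
            unsigned int tail = payload_len - (n_segs - 1) * tso_segsz;

            printf("SW segments the %u byte tail, HW segments %u bytes "
                   "into %u segments of %u bytes\n",
                   tail, (n_segs - 1) * tso_segsz, n_segs - 1, tso_segsz);
        } else {
            printf("Payload aligned on tso_segsz, full HW segmentation\n");
        }
        return 0;
    }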

Reported-at: https://issues.redhat.com/browse/FDP-1897
Signed-off-by: David Marchand <[email protected]>
---
 lib/dp-packet-gso.c | 72 ++++++++++++++++++++++++++++++++++++++-------
 lib/dp-packet-gso.h |  2 ++
 lib/netdev-dpdk.c   |  7 -----
 lib/netdev.c        | 34 ++++++++++++++-------
 lib/packets.c       | 34 +++++++++++++++++++++
 5 files changed, 121 insertions(+), 28 deletions(-)

diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
index 6b8da8b370..d8d3a87500 100644
--- a/lib/dp-packet-gso.c
+++ b/lib/dp-packet-gso.c
@@ -77,6 +77,20 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
     return DIV_ROUND_UP(data_length, segsz);
 }
 
+int
+dp_packet_gso_partial_nr_segs(struct dp_packet *p)
+{
+    if ((dp_packet_tunnel_geneve(p)
+         || dp_packet_tunnel_vxlan(p))
+        && dp_packet_l4_checksum_partial(p)
+        && dp_packet_get_inner_tcp_payload_length(p)
+           != dp_packet_gso_nr_segs(p) * dp_packet_get_tso_segsz(p)) {
+        return 2;
+    }
+
+    return 1;
+}
+
 static void
 dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
                              uint16_t tso_segsz)
@@ -139,7 +153,7 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
     tcp_seq += seg_no * tso_segsz;
     put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
 
-    if (OVS_LIKELY(seg_no < (n_segs - 1))) {
+    if (seg_no < (n_segs - 1) && !dp_packet_get_tso_segsz(seg)) {
         uint16_t tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
         /* Reset flags PUSH and FIN unless it is the last segment. */
         uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
@@ -160,12 +174,9 @@ dp_packet_gso_update_segment(struct dp_packet *seg, int seg_no, int n_segs,
     }
 }
 
-/* Perform software segmentation on packet 'p'.
- *
- * Segments packet 'p' into the array of preallocated batches in 'batches',
- * updating the 'batches' pointer as needed. */
-void
-dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
+static void
+dp_packet_gso__(struct dp_packet *p, struct dp_packet_batch **batches,
+                bool partial_seg)
 {
     struct dp_packet_batch *curr_batch = *batches;
     struct dp_packet *seg;
@@ -199,6 +210,13 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
         data_len = dp_packet_get_tcp_payload_length(p);
     }
 
+    if (partial_seg) {
+        if (dp_packet_gso_partial_nr_segs(p) != 1) {
+            goto last_seg;
+        }
+        goto first_seg;
+    }
+
     for (int i = 1; i < n_segs - 1; i++) {
         seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + i * tso_segsz,
                                     tso_segsz);
@@ -210,6 +228,7 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
         dp_packet_batch_add(curr_batch, seg);
     }
 
+last_seg:
     /* Create the last segment. */
     seg = dp_packet_gso_seg_new(p, hdr_len, hdr_len + (n_segs - 1) * tso_segsz,
                                 data_len - (n_segs - 1) * tso_segsz);
@@ -220,11 +239,44 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
     }
     dp_packet_batch_add(curr_batch, seg);
 
-    /* Trim the first segment and reset TSO. */
-    dp_packet_set_size(p, hdr_len + tso_segsz);
-    dp_packet_set_tso_segsz(p, 0);
+first_seg:
+    if (partial_seg) {
+        if (dp_packet_gso_partial_nr_segs(p) != 1) {
+            dp_packet_set_size(p, hdr_len + (n_segs - 1) * tso_segsz);
+            if (n_segs == 2) {
+                /* No need for HW segmentation, we already did the job. */
+                dp_packet_set_tso_segsz(p, 0);
+            }
+        }
+    } else {
+        /* Trim the first segment and reset TSO. */
+        dp_packet_set_size(p, hdr_len + tso_segsz);
+        dp_packet_set_tso_segsz(p, 0);
+    }
     dp_packet_gso_update_segment(p, 0, n_segs, tso_segsz);
 
 out:
     *batches = curr_batch;
 }
+
+/* Perform software segmentation on packet 'p'.
+ *
+ * Segments packet 'p' into the array of preallocated batches in 'batches',
+ * updating the 'batches' pointer as needed. */
+void
+dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
+{
+    dp_packet_gso__(p, batches, false);
+}
+
+/* Perform partial software segmentation on packet 'p'.
+ *
+ * For UDP tunnels, if the packet payload length is not aligned on the
+ * segmentation size, segments the last segment of packet 'p' into the array
+ * of preallocated batches in 'batches', updating the 'batches' pointer
+ * as needed. */
+void
+dp_packet_gso_partial(struct dp_packet *p, struct dp_packet_batch **batches)
+{
+    dp_packet_gso__(p, batches, true);
+}
diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
index 2abc19ea31..ed55a11d79 100644
--- a/lib/dp-packet-gso.h
+++ b/lib/dp-packet-gso.h
@@ -19,5 +19,7 @@
 
 void dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
 int dp_packet_gso_nr_segs(struct dp_packet *);
+void dp_packet_gso_partial(struct dp_packet *, struct dp_packet_batch **);
+int dp_packet_gso_partial_nr_segs(struct dp_packet *);
 
 #endif /* dp-packet-gso.h */
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 38cf0ebb2e..55278c2450 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2725,20 +2725,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
 
     if (mbuf->tso_segsz) {
         struct tcp_header *th = l4;
-        uint16_t link_tso_segsz;
         int hdr_len;
 
         mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
 
         hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
-        link_tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
         if (dp_packet_tunnel(pkt)) {
             hdr_len += mbuf->outer_l2_len + mbuf->outer_l3_len;
-            link_tso_segsz -= mbuf->outer_l3_len + mbuf->l2_len;
-        }
-
-        if (mbuf->tso_segsz > link_tso_segsz) {
-            mbuf->tso_segsz = link_tso_segsz;
         }
 
         if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
diff --git a/lib/netdev.c b/lib/netdev.c
index f493dab005..5413ca61d0 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -71,6 +71,7 @@ COVERAGE_DEFINE(netdev_add_router);
 COVERAGE_DEFINE(netdev_get_stats);
 COVERAGE_DEFINE(netdev_push_header_drops);
 COVERAGE_DEFINE(netdev_soft_seg_good);
+COVERAGE_DEFINE(netdev_partial_seg_good);
 
 struct netdev_saved_flags {
     struct netdev *netdev;
@@ -802,7 +803,8 @@ netdev_get_pt_mode(const struct netdev *netdev)
  * from netdev_class->send() if at least one batch failed to send. */
 static int
 netdev_send_tso(struct netdev *netdev, int qid,
-                struct dp_packet_batch *batch, bool concurrent_txq)
+                struct dp_packet_batch *batch, bool concurrent_txq,
+                bool partial_seg)
 {
     struct dp_packet_batch *batches;
     struct dp_packet *packet;
@@ -812,11 +814,15 @@ netdev_send_tso(struct netdev *netdev, int qid,
     int error;
 
     /* Calculate the total number of packets in the batch after
-     * the segmentation. */
+     * the (partial?) segmentation. */
     n_packets = 0;
     DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
         if (dp_packet_get_tso_segsz(packet)) {
-            n_packets += dp_packet_gso_nr_segs(packet);
+            if (partial_seg) {
+                n_packets += dp_packet_gso_partial_nr_segs(packet);
+            } else {
+                n_packets += dp_packet_gso_nr_segs(packet);
+            }
         } else {
             n_packets++;
         }
@@ -842,8 +848,13 @@ netdev_send_tso(struct netdev *netdev, int qid,
     curr_batch = batches;
     DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
         if (dp_packet_get_tso_segsz(packet)) {
-            dp_packet_gso(packet, &curr_batch);
-            COVERAGE_INC(netdev_soft_seg_good);
+            if (partial_seg) {
+                dp_packet_gso_partial(packet, &curr_batch);
+                COVERAGE_INC(netdev_partial_seg_good);
+            } else {
+                dp_packet_gso(packet, &curr_batch);
+                COVERAGE_INC(netdev_soft_seg_good);
+            }
         } else {
             if (dp_packet_batch_is_full(curr_batch)) {
                 curr_batch++;
@@ -906,7 +917,8 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
         if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
             DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
                 if (dp_packet_get_tso_segsz(packet)) {
-                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq,
+                                           false);
                 }
             }
         } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
@@ -915,16 +927,16 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
             DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
                 if (dp_packet_get_tso_segsz(packet)
                     && dp_packet_tunnel(packet)) {
-                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq,
+                                           false);
                 }
             }
         } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
             DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
                 if (dp_packet_get_tso_segsz(packet)
-                    && (dp_packet_tunnel_vxlan(packet)
-                        || dp_packet_tunnel_geneve(packet))
-                    && dp_packet_l4_checksum_partial(packet)) {
-                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+                    && dp_packet_gso_partial_nr_segs(packet) != 1) {
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq,
+                                           true);
                 }
             }
         }
diff --git a/lib/packets.c b/lib/packets.c
index c05b4abcc8..4e5cac66e4 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -33,6 +33,7 @@
 #include "ovs-thread.h"
 #include "odp-util.h"
 #include "dp-packet.h"
+#include "dp-packet-gso.h"
 #include "unaligned.h"
 
 const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
@@ -2103,6 +2104,7 @@ packet_udp_tunnel_csum(struct dp_packet *p)
     uint32_t partial_csum;
     struct ip_header *ip;
     uint32_t inner_csum;
+    uint16_t tso_segsz;
     void *inner_l4;
 
     inner_ip = dp_packet_inner_l3(p);
@@ -2180,6 +2182,38 @@ packet_udp_tunnel_csum(struct dp_packet *p)
     partial_csum = csum_continue(partial_csum, inner_l4_csum_p + 1,
         (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1));
     udp->udp_csum = csum_finish(partial_csum);
+    tso_segsz = dp_packet_get_tso_segsz(p);
+    if (tso_segsz) {
+        uint16_t payload_len = dp_packet_get_inner_tcp_payload_length(p);
+
+        ovs_assert(payload_len == tso_segsz * dp_packet_gso_nr_segs(p));
+
+        /* The pseudo header used in the outer UDP checksum is dependent on
+         * the ip_tot_len / ip6_plen which was a reflection of the TSO frame
+         * size. The segmented packet will be shorter. */
+        udp->udp_csum = recalc_csum16(udp->udp_csum, htons(payload_len),
+                                      htons(tso_segsz));
+
+        /* When segmenting the packet, various headers get updated:
+         * - inner L3
+         *   - for IPv4, ip_tot_len is updated, BUT it is not affecting the
+         *     outer UDP checksum because the IPv4 header itself contains
+         *     a checksum that compensates for this update,
+         *   - for IPv6, ip6_plen is updated, and this must be considered,
+         * - inner L4
+         *   - inner pseudo header used in the TCP checksum is dependent on
+         *     the inner ip_tot_len / ip6_plen,
+         *   - TCP seq number is updated,
+         *   - the HW may change some TCP flags (think PSH/FIN),
+         *   BUT the TCP checksum will compensate for those updates,
+         *
+         * Summary: we only need to care about the inner IPv6 header update.
+         */
+        if (IP_VER(inner_ip->ip_ihl_ver) != 4) {
+            udp->udp_csum = recalc_csum16(udp->udp_csum, htons(payload_len),
+                                          htons(tso_segsz));
+        }
+    }
     if (!udp->udp_csum) {
         udp->udp_csum = htons(0xffff);
     }
-- 
2.51.0
