When sending packets that are flagged as requiring segmentation to an
interface that doens't support this feature, send the packet to the TSO
software fallback instead of dropping it.

Signed-off-by: Mike Pattrick <[email protected]>

---
v2:
 - Fixed udp tunnel length
 - Added test that UDP headers are correct
 - Split inner and outer ip_id into different counters
 - Set tunnel flags in reset_tcp_seg

v3:
 - Changed logic of netdev_send() to account for NICs that support
 tunnel offload but not checksum offload
 - Adjusted udp tunnel header length during software tso
---
 lib/dp-packet-gso.c     | 87 +++++++++++++++++++++++++++++++++--------
 lib/dp-packet.h         | 34 ++++++++++++++++
 lib/netdev-native-tnl.c |  8 ++++
 lib/netdev.c            | 44 ++++++++++-----------
 tests/system-traffic.at | 87 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 221 insertions(+), 39 deletions(-)

diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
index 847685ad9..4ac16b7d5 100644
--- a/lib/dp-packet-gso.c
+++ b/lib/dp-packet-gso.c
@@ -47,6 +47,8 @@ dp_packet_gso_seg_new(const struct dp_packet *p, size_t 
hdr_len,
     seg->l2_5_ofs = p->l2_5_ofs;
     seg->l3_ofs = p->l3_ofs;
     seg->l4_ofs = p->l4_ofs;
+    seg->inner_l3_ofs = p->inner_l3_ofs;
+    seg->inner_l4_ofs = p->inner_l4_ofs;
 
     /* The protocol headers remain the same, so preserve hash and mark. */
     *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p);
@@ -71,7 +73,12 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
     const char *data_tail;
     const char *data_pos;
 
-    data_pos = dp_packet_get_tcp_payload(p);
+    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
+        dp_packet_hwol_is_tunnel_geneve(p)) {
+        data_pos = dp_packet_get_inner_tcp_payload(p);
+    } else {
+        data_pos = dp_packet_get_tcp_payload(p);
+    }
     data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
 
     return DIV_ROUND_UP(data_tail - data_pos, segsz);
@@ -90,13 +97,16 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch 
**batches)
     struct dp_packet_batch *curr_batch = *batches;
     struct tcp_header *tcp_hdr;
     struct ip_header *ip_hdr;
+    uint16_t inner_ip_id = 0;
+    uint16_t outer_ip_id = 0;
     struct dp_packet *seg;
     uint16_t tcp_offset;
     uint16_t tso_segsz;
     uint32_t tcp_seq;
-    uint16_t ip_id;
+    bool outer_ipv4;
     int hdr_len;
     int seg_len;
+    bool tnl;
 
     tso_segsz = dp_packet_get_tso_segsz(p);
     if (!tso_segsz) {
@@ -105,20 +115,37 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch 
**batches)
         return false;
     }
 
-    tcp_hdr = dp_packet_l4(p);
-    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
-    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
-    hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p))
-              + tcp_offset * 4;
-    ip_id = 0;
-    if (dp_packet_hwol_is_ipv4(p)) {
+    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
+            dp_packet_hwol_is_tunnel_geneve(p)) {
+        outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p);
+        tcp_hdr = dp_packet_inner_l4(p);
+        ip_hdr = dp_packet_inner_l3(p);
+        tnl = true;
+
+        if (outer_ipv4) {
+            outer_ip_id = ntohs(((struct ip_header *) dp_packet_l3(p))->ip_id);
+        }
+        if (dp_packet_hwol_is_ipv4(p)) {
+            inner_ip_id = ntohs(ip_hdr->ip_id);
+        }
+    } else {
+        outer_ipv4 = dp_packet_hwol_is_ipv4(p);
+        tcp_hdr = dp_packet_l4(p);
         ip_hdr = dp_packet_l3(p);
-        ip_id = ntohs(ip_hdr->ip_id);
+        tnl = false;
+
+        if (outer_ipv4) {
+            outer_ip_id = ntohs(ip_hdr->ip_id);
+        }
     }
 
+    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
+    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
+    hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p))
+              + tcp_offset * 4;
     const char *data_tail = (char *) dp_packet_tail(p)
                             - dp_packet_l2_pad_size(p);
-    const char *data_pos = dp_packet_get_tcp_payload(p);
+    const char *data_pos = (char *) tcp_hdr + tcp_offset * 4;
     int n_segs = dp_packet_gso_nr_segs(p);
 
     for (int i = 0; i < n_segs; i++) {
@@ -130,14 +157,36 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch 
**batches)
         seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
         data_pos += seg_len;
 
+        if (tnl) {
+            /* Update tunnel UDP header length. */
+            struct udp_header *tnl_hdr;
+
+            tnl_hdr = dp_packet_l4(seg);
+            tnl_hdr->udp_len = htons(dp_packet_l4_size(seg));
+
+            /* Update tunnel inner L3 header. */
+            if (dp_packet_hwol_is_ipv4(seg)) {
+                ip_hdr = dp_packet_inner_l3(seg);
+                ip_hdr->ip_tot_len = htons(dp_packet_inner_l3_size(seg));
+                ip_hdr->ip_id = htons(inner_ip_id);
+                ip_hdr->ip_csum = 0;
+                inner_ip_id++;
+            } else {
+                struct ovs_16aligned_ip6_hdr *ip6_hdr;
+
+                ip6_hdr = dp_packet_inner_l3(seg);
+                ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
+                    = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
+            }
+        }
+
         /* Update L3 header. */
-        if (dp_packet_hwol_is_ipv4(seg)) {
+        if (outer_ipv4) {
             ip_hdr = dp_packet_l3(seg);
-            ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
-                                       dp_packet_l4_size(seg));
-            ip_hdr->ip_id = htons(ip_id);
+            ip_hdr->ip_tot_len = htons(dp_packet_l3_size(seg));
+            ip_hdr->ip_id = htons(outer_ip_id);
             ip_hdr->ip_csum = 0;
-            ip_id++;
+            outer_ip_id++;
         } else {
             struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
 
@@ -146,7 +195,11 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch 
**batches)
         }
 
         /* Update L4 header. */
-        tcp_hdr = dp_packet_l4(seg);
+        if (tnl) {
+            tcp_hdr = dp_packet_inner_l4(seg);
+        } else {
+            tcp_hdr = dp_packet_l4(seg);
+        }
         put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
         tcp_seq += seg_len;
         if (OVS_LIKELY(i < (n_segs - 1))) {
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index a75b1c5cd..af1ff088f 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -529,6 +529,16 @@ dp_packet_inner_l3(const struct dp_packet *b)
            : NULL;
 }
 
+static inline size_t
+dp_packet_inner_l3_size(const struct dp_packet *b)
+{
+    return OVS_LIKELY(b->inner_l3_ofs != UINT16_MAX)
+           ? (const char *) dp_packet_tail(b)
+           - (const char *) dp_packet_inner_l3(b)
+           - dp_packet_l2_pad_size(b)
+           : 0;
+}
+
 static inline void *
 dp_packet_inner_l4(const struct dp_packet *b)
 {
@@ -563,6 +573,22 @@ dp_packet_get_tcp_payload(const struct dp_packet *b)
     return NULL;
 }
 
+static inline const void *
+dp_packet_get_inner_tcp_payload(const struct dp_packet *b)
+{
+    size_t l4_size = dp_packet_inner_l4_size(b);
+
+    if (OVS_LIKELY(l4_size >= TCP_HEADER_LEN)) {
+        struct tcp_header *tcp = dp_packet_inner_l4(b);
+        int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+
+        if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= l4_size)) {
+            return (const char *) tcp + tcp_len;
+        }
+    }
+    return NULL;
+}
+
 static inline uint32_t
 dp_packet_get_tcp_payload_length(const struct dp_packet *pkt)
 {
@@ -1320,6 +1346,14 @@ dp_packet_hwol_reset_tcp_seg(struct dp_packet *p)
         ol_flags |= DP_PACKET_OL_TX_IP_CKSUM;
     }
 
+    if (ol_flags & (DP_PACKET_OL_TX_TUNNEL_VXLAN |
+                    DP_PACKET_OL_TX_TUNNEL_GENEVE)) {
+        if (ol_flags & DP_PACKET_OL_TX_OUTER_IPV4) {
+            ol_flags |= DP_PACKET_OL_TX_OUTER_IP_CKSUM;
+        }
+        ol_flags |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM;
+    }
+
     *dp_packet_ol_flags_ptr(p) = ol_flags;
 }
 
diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
index 0f9f07f44..22cc858e7 100644
--- a/lib/netdev-native-tnl.c
+++ b/lib/netdev-native-tnl.c
@@ -308,6 +308,14 @@ netdev_tnl_push_udp_header(const struct netdev *netdev 
OVS_UNUSED,
     if (l4_ofs != UINT16_MAX) {
         packet->inner_l4_ofs = l4_ofs + data->header_len;
     }
+
+    if (dp_packet_hwol_is_tso(packet)) {
+        uint16_t tso_segsz = dp_packet_get_tso_segsz(packet);
+        if (tso_segsz > data->header_len) {
+            tso_segsz -= data->header_len;
+            dp_packet_set_tso_segsz(packet, tso_segsz);
+        }
+    }
 }
 
 static void *
diff --git a/lib/netdev.c b/lib/netdev.c
index f2d921ed6..866dbf3b7 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -69,8 +69,6 @@ COVERAGE_DEFINE(netdev_received);
 COVERAGE_DEFINE(netdev_sent);
 COVERAGE_DEFINE(netdev_add_router);
 COVERAGE_DEFINE(netdev_get_stats);
-COVERAGE_DEFINE(netdev_vxlan_tso_drops);
-COVERAGE_DEFINE(netdev_geneve_tso_drops);
 COVERAGE_DEFINE(netdev_push_header_drops);
 COVERAGE_DEFINE(netdev_soft_seg_good);
 COVERAGE_DEFINE(netdev_soft_seg_drops);
@@ -910,28 +908,30 @@ netdev_send(struct netdev *netdev, int qid, struct 
dp_packet_batch *batch,
     struct dp_packet *packet;
     int error;
 
-    if (userspace_tso_enabled() &&
-        !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
-        DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
-            if (dp_packet_hwol_is_tso(packet)) {
-                if (dp_packet_hwol_is_tunnel_vxlan(packet)
-                    && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) {
-                        VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support",
-                                     netdev_get_name(netdev));
-                        COVERAGE_INC(netdev_vxlan_tso_drops);
-                        dp_packet_delete_batch(batch, true);
-                        return false;
+    if (userspace_tso_enabled()) {
+        if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
+            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+                if (dp_packet_hwol_is_tso(packet)) {
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
                 }
-
-                if (dp_packet_hwol_is_tunnel_geneve(packet)
-                    && !(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) {
-                        VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support",
-                                     netdev_get_name(netdev));
-                        COVERAGE_INC(netdev_geneve_tso_drops);
-                        dp_packet_delete_batch(batch, true);
-                        return false;
+            }
+        } else if (!(netdev_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
+            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+                if (dp_packet_hwol_is_tso(packet) &&
+                    (dp_packet_hwol_is_tunnel_vxlan(packet) ||
+                     dp_packet_hwol_is_tunnel_geneve(packet)) &&
+                    dp_packet_hwol_is_outer_udp_cksum(packet)) {
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+                }
+            }
+        } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
+                                     NETDEV_TX_GENEVE_TNL_TSO))) {
+            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+                if (dp_packet_hwol_is_tso(packet) &&
+                    (dp_packet_hwol_is_tunnel_vxlan(packet) ||
+                     dp_packet_hwol_is_tunnel_geneve(packet))) {
+                    return netdev_send_tso(netdev, qid, batch, concurrent_txq);
                 }
-                return netdev_send_tso(netdev, qid, batch, concurrent_txq);
             }
         }
     }
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 3f1a15445..dd30b6bf5 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -351,6 +351,93 @@ OVS_WAIT_UNTIL([diff -q payload.bin udp_data])
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([datapath - tcp over vxlan tunnel with software fallback])
+AT_SKIP_IF([test $HAVE_NC = no])
+AT_SKIP_IF([test $HAVE_TCPDUMP = no])
+OVS_CHECK_VXLAN()
+
+dnl This test is only valid with tso. If the kernel segments the packets, the
+dnl packet lengths in the final test will be different.
+m4_ifndef([CHECK_SYSTEM_TSO], [AT_SKIP_IF(:)])
+
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-underlay])
+
+AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
+AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"])
+
+ADD_NAMESPACES(at_ns0)
+
+dnl Set up underlay link from host into the namespace using veth pair.
+ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24")
+AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"])
+AT_CHECK([ip link set dev br-underlay up])
+
+dnl Test the case where only one side has all checksum and tso offload 
disabled.
+AT_CHECK([ethtool -K ovs-p0 tso off], [0], [ignore], [ignore])
+AT_CHECK([ethtool -K ovs-p0 sg off], [0], [ignore], [ignore])
+
+dnl Reinitialize.
+AT_CHECK([ovs-vsctl del-port ovs-p0])
+AT_CHECK([ovs-vsctl add-port br-underlay ovs-p0])
+
+dnl Set up tunnel endpoints on OVS outside the namespace and with a native
+dnl linux device inside the namespace.
+ADD_OVS_TUNNEL([vxlan], [br0], [at_vxlan0], [172.31.1.1], [10.1.1.100/24])
+ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], 
[10.1.1.1/24],
+                  [id 0 dstport 4789])
+
+dnl First, check the underlay.
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], 
[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Check that the tunnel is up.
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], 
[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Start tcpdump to capture the encapsulated packets.
+OVS_DAEMONIZE([tcpdump -i ovs-p0 -w p0.pcap], [tcpdump.pid])
+sleep 1
+
+dnl Initialize the listener before it is needed.
+NETNS_DAEMONIZE([at_ns0], [nc -l 10.1.1.1 1234 > data2], [nc.pid])
+
+dnl Verify that ncat is ready.
+OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :1234])])
+
+dnl Large TCP transfer aimed towards ovs-p0, which has TSO disabled.
+AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null])
+AT_CHECK([nc $NC_EOF_OPT 10.1.1.1 1234 < payload.bin])
+
+dnl Wait until transfer completes before checking.
+OVS_WAIT_WHILE([kill -0 $(cat nc.pid)])
+AT_CHECK([diff -q payload.bin data2], [0])
+
+dnl Stop OVS and tcpdump and verify the results.
+OVS_TRAFFIC_VSWITCHD_STOP
+
+AT_CHECK([kill -15 $(cat tcpdump.pid)])
+
+OVS_WAIT_WHILE([kill -0 $(cat tcpdump.pid)])
+
+ovs-pcap p0.pcap
+
+dnl The exact number of packets sent will vary, but we check that the largest 
segments have the correct
+dnl lengths and certain other fields.
+AT_CHECK([test $(ovs-pcap p0.pcap | grep -Ec dnl
+"^.{24}0800"dnl Ethernet
+"450005aa....4000..11....ac1f0164ac1f0101"dnl IP(len=1450, DF, UDP, 
172.31.1.100->172.31.1.1)
+"....12b505960000"dnl UDP(len=1430, dport=4789)
+"0800000000000000"dnl VXLAN(gpid=0, vni=0)
+".{24}0800"dnl Ethernet
+"45000578....4000..06....0a0101640a010101"dnl IP(len=1400, DF, TCP, 
10.1.1.100->10.1.1.1)
+"....04d2............................0000"dnl TCP(dport=1234
+) -ge 20])
+
+AT_CLEANUP
+
 AT_SETUP([datapath - ping vlan over vxlan tunnel])
 OVS_CHECK_TUNNEL_TSO()
 OVS_CHECK_VXLAN()
-- 
2.39.3

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to