This patch introduces Tx TCP/UDP checksum offload support for DPDK
physical NICs.  The feature is enabled by default and can be disabled
by setting the tx-checksum-offload option to false, e.g.:

ovs-vsctl set Interface dpdk-eth3 \
 options:tx-checksum-offload=false
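
The configured value can be read back with ovs-vsctl (the interface name
here is only an example):

ovs-vsctl get Interface dpdk-eth3 options:tx-checksum-offload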

Signed-off-by: Zhenyu Gao <[email protected]>
---
 lib/netdev-dpdk.c    | 144 +++++++++++++++++++++++++++++++++++++++++++++++++--
 vswitchd/vswitch.xml |  15 ++++++
 2 files changed, 156 insertions(+), 3 deletions(-)
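
Note for reviewers (not part of the patch): below is a minimal sketch of
the mbuf preparation that netdev_refill_l4_cksum() performs for TCP over
IPv4, using the DPDK API names already used in this patch (PKT_TX_IPV4,
PKT_TX_TCP_CKSUM, rte_ipv4_phdr_cksum); the helper name and its argument
list are hypothetical.

    /* Minimal sketch: prepare an mbuf for hardware TCP checksum offload.
     * The PMD expects l2_len/l3_len to be set, the TCP checksum field to
     * hold the pseudo-header checksum, and the matching PKT_TX_* flags. */
    #include <rte_ip.h>
    #include <rte_mbuf.h>
    #include <rte_tcp.h>

    static void
    prepare_tcp_csum_offload(struct rte_mbuf *m, struct ipv4_hdr *ip,
                             struct tcp_hdr *tcp, uint16_t l2_len)
    {
        m->l2_len = l2_len;                        /* e.g. Ethernet header. */
        m->l3_len = (ip->version_ihl & 0x0f) * 4;  /* IPv4 header length.   */
        m->ol_flags |= PKT_TX_IPV4 | PKT_TX_TCP_CKSUM;
        tcp->cksum = rte_ipv4_phdr_cksum(ip, m->ol_flags);
    }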

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index ea17b97..489688f 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -32,6 +32,7 @@
 #include <rte_mbuf.h>
 #include <rte_meter.h>
 #include <rte_virtio_net.h>
+#include <rte_ip.h>
 
 #include "dirs.h"
 #include "dp-packet.h"
@@ -328,6 +329,7 @@ struct ingress_policer {
 
 enum dpdk_hw_ol_features {
     NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
+    NETDEV_TX_CHECKSUM_OFFLOAD = 1 << 1,
 };
 
 struct netdev_dpdk {
@@ -649,6 +651,8 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
     int diag = 0;
     int i;
     struct rte_eth_conf conf = port_conf;
+    struct rte_eth_txconf *txconf;
+    struct rte_eth_dev_info dev_info;
 
     if (dev->mtu > ETHER_MTU) {
         conf.rxmode.jumbo_frame = 1;
@@ -676,9 +680,16 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
             break;
         }
 
+        rte_eth_dev_info_get(dev->port_id, &dev_info);
+        txconf = &dev_info.default_txconf;
+        if (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) {
+            /* Enable Tx checksum offload on the physical NIC. */
+            txconf->txq_flags = 0;
+        }
+
         for (i = 0; i < n_txq; i++) {
             diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
-                                          dev->socket_id, NULL);
+                                          dev->socket_id, txconf);
             if (diag) {
                 VLOG_INFO("Interface %s txq(%d) setup error: %s",
                           dev->up.name, i, rte_strerror(-diag));
@@ -719,6 +730,29 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
 }
 
 static void
+dpdk_eth_checksum_offload_configure(struct netdev_dpdk *dev)
+    OVS_REQUIRES(dev->mutex)
+{
+    struct rte_eth_dev_info info;
+    bool tx_csum_ol_flag = false;
+    uint32_t tx_chksm_offload_capa = DEV_TX_OFFLOAD_UDP_CKSUM |
+                                     DEV_TX_OFFLOAD_TCP_CKSUM;
+
+    rte_eth_dev_info_get(dev->port_id, &info);
+    tx_csum_ol_flag = (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) != 0;
+
+    if (tx_csum_ol_flag &&
+        (info.tx_offload_capa & tx_chksm_offload_capa) !=
+            tx_chksm_offload_capa) {
+        VLOG_WARN_ONCE("Tx checksum offload is not supported on device %"PRIu8,
+                       dev->port_id);
+        dev->hw_ol_features &= ~NETDEV_TX_CHECKSUM_OFFLOAD;
+    } else {
+        netdev_request_reconfigure(&dev->up);
+    }
+}
+
+static void
 dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
 {
     if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
@@ -1108,6 +1142,11 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
         } else {
             smap_add(args, "rx_csum_offload", "false");
         }
+        if (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) {
+            smap_add(args, "tx_csum_offload", "true");
+        } else {
+            smap_add(args, "tx_csum_offload", "false");
+        }
     }
     ovs_mutex_unlock(&dev->mutex);
 
@@ -1198,6 +1237,9 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
         {RTE_FC_NONE,     RTE_FC_TX_PAUSE},
         {RTE_FC_RX_PAUSE, RTE_FC_FULL    }
     };
+    bool tx_chksm_ofld;
+    bool temp_tx_flag;
+    bool change = false;
     const char *new_devargs;
     int err = 0;
 
@@ -1279,6 +1321,19 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
         dpdk_eth_flow_ctrl_setup(dev);
     }
 
+    /* Tx checksum offload configuration. */
+    /* Tx checksum offload is enabled by default. */
+    tx_chksm_ofld = smap_get_bool(args, "tx-checksum-offload", true);
+    temp_tx_flag = (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD)
+                        != 0;
+    if (temp_tx_flag != tx_chksm_ofld) {
+        dev->hw_ol_features ^= NETDEV_TX_CHECKSUM_OFFLOAD;
+        change = true;
+    }
+
+    if (change) {
+        dpdk_eth_checksum_offload_configure(dev);
+    }
 out:
     ovs_mutex_unlock(&dev->mutex);
     ovs_mutex_unlock(&dpdk_mutex);
@@ -1392,6 +1447,81 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+static inline void
+netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
+                       uint8_t l4_proto, bool is_ipv4)
+{
+    void *l3hdr = (void *)(data + pkt->l3_ofs);
+    if (l4_proto == IPPROTO_TCP) {
+        struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt->l4_ofs);
+
+        pkt->mbuf.l2_len = pkt->l3_ofs;
+        pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
+        tcp_hdr->tcp_csum = 0;
+        if (is_ipv4) {
+            pkt->mbuf.ol_flags |= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
+            tcp_hdr->tcp_csum = rte_ipv4_phdr_cksum(l3hdr,
+                                                    pkt->mbuf.ol_flags);
+        } else {
+            pkt->mbuf.ol_flags |= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
+            tcp_hdr->tcp_csum = rte_ipv6_phdr_cksum(l3hdr,
+                                                    pkt->mbuf.ol_flags);
+
+        }
+    } else if (l4_proto == IPPROTO_UDP) {
+        struct udp_header *udp_hdr = (struct udp_header *)(data + pkt->l4_ofs);
+        /* Do not recalculate the UDP checksum if it was zero (disabled). */
+        if (udp_hdr->udp_csum != 0) {
+            pkt->mbuf.l2_len = pkt->l3_ofs;
+            pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
+            udp_hdr->udp_csum = 0;
+            if (is_ipv4) {
+                /* Do not calculate the UDP checksum for IP fragments. */
+                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
+                                      fragment_offset)) {
+                    return;
+                }
+                pkt->mbuf.ol_flags |= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
+                udp_hdr->udp_csum = rte_ipv4_phdr_cksum(l3hdr,
+                                                        pkt->mbuf.ol_flags);
+            } else {
+                pkt->mbuf.ol_flags |= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
+                udp_hdr->udp_csum = rte_ipv6_phdr_cksum(l3hdr,
+                                                        pkt->mbuf.ol_flags);
+            }
+        }
+    }
+}
+
+static inline void
+netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
+{
+    int i = 0;
+
+    for (i = 0; i < pkt_cnt; i++) {
+        ovs_be16 dl_type;
+        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
+        const char *data = dp_packet_data(pkt);
+        void *l3hdr = (char *)(data + pkt->l3_ofs);
+
+        if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
+            continue;
+        }
+
+        dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
+        if (dl_type == htons(ETH_TYPE_IP)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,
+                                   true);
+        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv6_hdr *)l3hdr)->proto,
+                                   false);
+        }
+
+    }
+}
+
 /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
  * 'pkts', even in case of failure.
  *
@@ -1780,6 +1910,11 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
         /* We have to do a copy for now */
         memcpy(rte_pktmbuf_mtod(pkts[newcnt], void *),
                dp_packet_data(batch->packets[i]), size);
+        if (batch->packets[i]->mbuf.ol_flags & PKT_TX_TCP_CKSUM) {
+            pkts[newcnt]->l2_len = batch->packets[i]->mbuf.l2_len;
+            pkts[newcnt]->l3_len = batch->packets[i]->mbuf.l3_len;
+            pkts[newcnt]->ol_flags = batch->packets[i]->mbuf.ol_flags;
+        }
 
         rte_pktmbuf_data_len(pkts[newcnt]) = size;
         rte_pktmbuf_pkt_len(pkts[newcnt]) = size;
@@ -1833,6 +1968,10 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
         return;
     }
 
+    if (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) {
+        netdev_prepare_tx_csum(batch->packets, batch->count);
+    }
+
     if (OVS_UNLIKELY(concurrent_txq)) {
         qid = qid % dev->up.n_txq;
         rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
@@ -2741,8 +2880,7 @@ netdev_dpdk_vhost_class_init(void)
     if (ovsthread_once_start(&once)) {
         rte_vhost_driver_callback_register(&virtio_net_device_ops);
         rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
-                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                  | 1ULL << VIRTIO_NET_F_CSUM);
+                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6);
         ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
 
         ovsthread_once_done(&once);
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 883ecd8..08e8d1d 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -3480,6 +3480,21 @@
       </column>
     </group>
 
+    <group title="TX Checksum Offload Configuration">
+      <p>
+        Checksum calculation for outgoing packets can be offloaded to
+        the NIC using the Tx checksum offload feature.  This is
+        currently implemented only for <code>dpdk</code> physical
+        interfaces.
+      </p>
+
+      <column name="options" key="tx-checksum-offload"
+              type='{"type": "boolean"}'>
+        Set to <code>false</code> to disable Tx checksum offloading on
+        <code>dpdk</code> physical ports.  By default, it is enabled.
+      </column>
+    </group>
+
     <group title="Common Columns">
       The overall purpose of these columns is described under <code>Common
       Columns</code> at the beginning of this document.
-- 
1.8.3.1
