This patch introduces TX TCP-checksum offload support for DPDK pNIC. The feature is disabled by default and can be enabled by setting the tx-checksum-offload option, for example: ovs-vsctl set Interface dpdk-eth3 \ options:tx-checksum-offload=true --- lib/netdev-dpdk.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++---- vswitchd/vswitch.xml | 13 ++++-- 2 files changed, 115 insertions(+), 10 deletions(-)
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index bba4de3..5a68a48 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -32,6 +32,7 @@ #include <rte_mbuf.h> #include <rte_meter.h> #include <rte_virtio_net.h> +#include <rte_ip.h> #include "dirs.h" #include "dp-packet.h" @@ -328,6 +329,7 @@ struct ingress_policer { enum dpdk_hw_ol_features { NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0, + NETDEV_TX_CHECKSUM_OFFLOAD = 1 << 1, }; struct netdev_dpdk { @@ -649,6 +651,8 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq) int diag = 0; int i; struct rte_eth_conf conf = port_conf; + struct rte_eth_txconf *txconf; + struct rte_eth_dev_info dev_info; if (dev->mtu > ETHER_MTU) { conf.rxmode.jumbo_frame = 1; @@ -676,9 +680,16 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq) break; } + rte_eth_dev_info_get(dev->port_id, &dev_info); + txconf = &dev_info.default_txconf; + if (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) { + /*Enable tx offload feature on pnic*/ + txconf->txq_flags = 0; + } + for (i = 0; i < n_txq; i++) { diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size, - dev->socket_id, NULL); + dev->socket_id, txconf); if (diag) { VLOG_INFO("Interface %s txq(%d) setup error: %s", dev->up.name, i, rte_strerror(-diag)); @@ -724,11 +735,15 @@ dpdk_eth_checksum_offload_configure(struct netdev_dpdk *dev) { struct rte_eth_dev_info info; bool rx_csum_ol_flag = false; + bool tx_csum_ol_flag = false; uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM | DEV_RX_OFFLOAD_TCP_CKSUM | DEV_RX_OFFLOAD_IPV4_CKSUM; + uint32_t tx_chksm_offload_capa = DEV_TX_OFFLOAD_TCP_CKSUM; + rte_eth_dev_info_get(dev->port_id, &info); rx_csum_ol_flag = (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) != 0; + tx_csum_ol_flag = (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) != 0; if (rx_csum_ol_flag && (info.rx_offload_capa & rx_chksm_offload_capa) != @@ -736,9 +751,15 @@ dpdk_eth_checksum_offload_configure(struct netdev_dpdk *dev) 
VLOG_WARN_ONCE("Rx checksum offload is not supported on device %"PRIu8, dev->port_id); dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD; - return; + } else if (tx_csum_ol_flag && + (info.tx_offload_capa & tx_chksm_offload_capa) != + tx_chksm_offload_capa) { + VLOG_WARN_ONCE("Tx checksum offload is not supported on device %"PRIu8, + dev->port_id); + dev->hw_ol_features &= ~NETDEV_TX_CHECKSUM_OFFLOAD; + } else { + netdev_request_reconfigure(&dev->up); } - netdev_request_reconfigure(&dev->up); } static void @@ -1119,6 +1140,11 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args) } else { smap_add(args, "rx_csum_offload", "false"); } + if (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) { + smap_add(args, "tx_csum_offload", "true"); + } else { + smap_add(args, "tx_csum_offload", "false"); + } } ovs_mutex_unlock(&dev->mutex); @@ -1210,7 +1236,10 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, {RTE_FC_RX_PAUSE, RTE_FC_FULL } }; bool rx_chksm_ofld; - bool temp_flag; + bool tx_chksm_ofld; + bool temp_rx_flag; + bool temp_tx_flag; + bool change = false; const char *new_devargs; int err = 0; @@ -1295,13 +1324,24 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, /* Rx checksum offload configuration */ /* By default the Rx checksum offload is ON */ rx_chksm_ofld = smap_get_bool(args, "rx-checksum-offload", true); - temp_flag = (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) + tx_chksm_ofld = smap_get_bool(args, "tx-checksum-offload", false); + temp_rx_flag = (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) != 0; - if (temp_flag != rx_chksm_ofld) { + temp_tx_flag = (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) + != 0; + if (temp_rx_flag != rx_chksm_ofld) { dev->hw_ol_features ^= NETDEV_RX_CHECKSUM_OFFLOAD; - dpdk_eth_checksum_offload_configure(dev); + change = true; } + if (temp_tx_flag != tx_chksm_ofld) { + dev->hw_ol_features ^= NETDEV_TX_CHECKSUM_OFFLOAD; + change = true; + } + + if (change) 
{ + dpdk_eth_checksum_offload_configure(dev); + } out: ovs_mutex_unlock(&dev->mutex); ovs_mutex_unlock(&dpdk_mutex); @@ -1415,6 +1455,55 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq) rte_free(rx); } +static inline void +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt) +{ + int i = 0; + + for (i = 0; i < pkt_cnt; i++) { + ovs_be16 dl_type; + struct dp_packet *pkt = (struct dp_packet*)pkts[i]; + const char *data = dp_packet_data(pkt); + char *l3hdr = (char*)(data + pkt->l3_ofs); + + if (pkt->l3_ofs == UINT16_MAX) { + continue; + } + + if (pkt->l4_ofs == UINT16_MAX) { + continue; + } + + + dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2); + if (dl_type == htons(ETH_TYPE_IP)) { + + if (((struct ipv4_hdr*)l3hdr)->next_proto_id == IPPROTO_TCP) { + struct tcp_header *tcp_hdr = (struct tcp_header*)(data + pkt->l4_ofs); + + pkt->mbuf.l2_len = pkt->l3_ofs; + pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs; + pkt->mbuf.ol_flags |= PKT_TX_TCP_CKSUM|PKT_TX_IPV4; + tcp_hdr->tcp_csum = 0; + tcp_hdr->tcp_csum = rte_ipv4_phdr_cksum((struct ipv4_hdr*)l3hdr, + pkt->mbuf.ol_flags); + } + } else if (dl_type == htons(ETH_TYPE_IPV6)) { + if (((struct ipv6_hdr*)l3hdr)->proto == IPPROTO_TCP) { + struct tcp_header *tcp_hdr = (struct tcp_header*)(data + pkt->l4_ofs); + + pkt->mbuf.l2_len = pkt->l3_ofs; + pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs; + pkt->mbuf.ol_flags |= PKT_TX_TCP_CKSUM|PKT_TX_IPV6; + tcp_hdr->tcp_csum = 0; + tcp_hdr->tcp_csum = rte_ipv6_phdr_cksum((struct ipv6_hdr*)l3hdr, + pkt->mbuf.ol_flags); + } + } + + } +} + /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of * 'pkts', even in case of failure. 
* @@ -1803,6 +1892,11 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) /* We have to do a copy for now */ memcpy(rte_pktmbuf_mtod(pkts[newcnt], void *), dp_packet_data(batch->packets[i]), size); + if (batch->packets[i]->mbuf.ol_flags & PKT_TX_TCP_CKSUM) { + pkts[newcnt]->l2_len = batch->packets[i]->mbuf.l2_len; + pkts[newcnt]->l3_len = batch->packets[i]->mbuf.l3_len; + pkts[newcnt]->ol_flags = batch->packets[i]->mbuf.ol_flags; + } rte_pktmbuf_data_len(pkts[newcnt]) = size; rte_pktmbuf_pkt_len(pkts[newcnt]) = size; @@ -1861,6 +1955,10 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid, rte_spinlock_lock(&dev->tx_q[qid].tx_lock); } + if (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) { + netdev_prepare_tx_csum(batch->packets, batch->count); + } + if (OVS_UNLIKELY(!may_steal || batch->packets[0]->source != DPBUF_DPDK)) { struct netdev *netdev = &dev->up; diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 9bb828f..826cf15 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3415,10 +3415,11 @@ </column> </group> - <group title="Rx Checksum Offload Configuration"> + <group title="Rx/TX Checksum Offload Configuration"> <p> - The checksum validation on the incoming packets are performed on NIC - using Rx checksum offload feature. Implemented only for <code>dpdk + The checksum validation on the incoming/outgoing packets are + performed on NIC using Rx/TX checksum offload feature. Implemented only + for <code>dpdk </code>physical interfaces. </p> @@ -3427,6 +3428,12 @@ Set to <code>false</code> to disble Rx checksum offloading on <code> dpdk</code>physical ports. By default, Rx checksum offload is enabled. </column> + + <column name="options" key="tx-checksum-offload" + type='{"type": "boolean"}'> + Set to <code>false</code> to disble Tx checksum offloading on <code> + dpdk</code>physical ports. By default, Tx checksum offload is disabled. 
+ </column> </group> <group title="Common Columns"> -- 1.8.3.1 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev