This patch introduces TX TCP/UDP checksum offload support for DPDK physical NICs. The feature is enabled by default and can be disabled by setting the tx-checksum-offload option, for example: ovs-vsctl set Interface dpdk-eth3 \ options:tx-checksum-offload=false
Signed-off-by: Zhenyu Gao <[email protected]> --- lib/netdev-dpdk.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++-- vswitchd/vswitch.xml | 15 ++++++ 2 files changed, 156 insertions(+), 3 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index ea17b97..489688f 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -32,6 +32,7 @@ #include <rte_mbuf.h> #include <rte_meter.h> #include <rte_virtio_net.h> +#include <rte_ip.h> #include "dirs.h" #include "dp-packet.h" @@ -328,6 +329,7 @@ struct ingress_policer { enum dpdk_hw_ol_features { NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0, + NETDEV_TX_CHECKSUM_OFFLOAD = 1 << 1, }; struct netdev_dpdk { @@ -649,6 +651,8 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq) int diag = 0; int i; struct rte_eth_conf conf = port_conf; + struct rte_eth_txconf *txconf; + struct rte_eth_dev_info dev_info; if (dev->mtu > ETHER_MTU) { conf.rxmode.jumbo_frame = 1; @@ -676,9 +680,16 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq) break; } + rte_eth_dev_info_get(dev->port_id, &dev_info); + txconf = &dev_info.default_txconf; + if (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) { + /*Enable tx offload feature on pnic*/ + txconf->txq_flags = 0; + } + for (i = 0; i < n_txq; i++) { diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size, - dev->socket_id, NULL); + dev->socket_id, txconf); if (diag) { VLOG_INFO("Interface %s txq(%d) setup error: %s", dev->up.name, i, rte_strerror(-diag)); @@ -719,6 +730,29 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq) } static void +dpdk_eth_checksum_offload_configure(struct netdev_dpdk *dev) + OVS_REQUIRES(dev->mutex) +{ + struct rte_eth_dev_info info; + bool tx_csum_ol_flag = false; + uint32_t tx_chksm_offload_capa = DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM; + + rte_eth_dev_info_get(dev->port_id, &info); + tx_csum_ol_flag = (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) != 0; + + if 
(tx_csum_ol_flag && + (info.tx_offload_capa & tx_chksm_offload_capa) != + tx_chksm_offload_capa) { + VLOG_WARN_ONCE("Tx checksum offload is not supported on device %"PRIu8, + dev->port_id); + dev->hw_ol_features &= ~NETDEV_TX_CHECKSUM_OFFLOAD; + } else { + netdev_request_reconfigure(&dev->up); + } +} + +static void dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex) { if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) { @@ -1108,6 +1142,11 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args) } else { smap_add(args, "rx_csum_offload", "false"); } + if (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) { + smap_add(args, "tx_csum_offload", "true"); + } else { + smap_add(args, "tx_csum_offload", "false"); + } } ovs_mutex_unlock(&dev->mutex); @@ -1198,6 +1237,9 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, {RTE_FC_NONE, RTE_FC_TX_PAUSE}, {RTE_FC_RX_PAUSE, RTE_FC_FULL } }; + bool tx_chksm_ofld; + bool temp_tx_flag; + bool change = false; const char *new_devargs; int err = 0; @@ -1279,6 +1321,19 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, dpdk_eth_flow_ctrl_setup(dev); } + /* Tx checksum offload configuration */ + /* By default the Tx checksum offload is ON */ + tx_chksm_ofld = smap_get_bool(args, "tx-checksum-offload", true); + temp_tx_flag = (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) + != 0; + if (temp_tx_flag != tx_chksm_ofld) { + dev->hw_ol_features ^= NETDEV_TX_CHECKSUM_OFFLOAD; + change = true; + } + + if (change) { + dpdk_eth_checksum_offload_configure(dev); + } out: ovs_mutex_unlock(&dev->mutex); ovs_mutex_unlock(&dpdk_mutex); @@ -1392,6 +1447,81 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq) rte_free(rx); } +static inline void +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt, + uint8_t l4_proto, bool is_ipv4) +{ + void *l3hdr = (void *)(data + pkt->l3_ofs); + if (l4_proto == IPPROTO_TCP) { + struct tcp_header *tcp_hdr = 
(struct tcp_header *)(data + pkt->l4_ofs); + + pkt->mbuf.l2_len = pkt->l3_ofs; + pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs; + tcp_hdr->tcp_csum = 0; + if (is_ipv4) { + pkt->mbuf.ol_flags |= PKT_TX_TCP_CKSUM | PKT_TX_IPV4; + tcp_hdr->tcp_csum = rte_ipv4_phdr_cksum(l3hdr, + pkt->mbuf.ol_flags); + } else { + pkt->mbuf.ol_flags |= PKT_TX_TCP_CKSUM | PKT_TX_IPV6; + tcp_hdr->tcp_csum = rte_ipv6_phdr_cksum(l3hdr, + pkt->mbuf.ol_flags); + + } + } else if (l4_proto == IPPROTO_UDP) { + struct udp_header *udp_hdr = (struct udp_header *)(data + pkt->l4_ofs); + /* do not recalculate udp cksum if it was 0 */ + if (udp_hdr->udp_csum != 0) { + pkt->mbuf.l2_len = pkt->l3_ofs; + pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs; + udp_hdr->udp_csum = 0; + if (is_ipv4) { + /*do not calculate udp cksum if it was a fragment IP*/ + if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)-> + fragment_offset)) { + return; + } + pkt->mbuf.ol_flags |= PKT_TX_UDP_CKSUM | PKT_TX_IPV4; + udp_hdr->udp_csum = rte_ipv4_phdr_cksum(l3hdr, + pkt->mbuf.ol_flags); + } else { + pkt->mbuf.ol_flags |= PKT_TX_UDP_CKSUM | PKT_TX_IPV6; + udp_hdr->udp_csum = rte_ipv6_phdr_cksum(l3hdr, + pkt->mbuf.ol_flags); + } + } + } +} + +static inline void +netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt) +{ + int i = 0; + + for (i = 0; i < pkt_cnt; i++) { + ovs_be16 dl_type; + struct dp_packet *pkt = (struct dp_packet *)pkts[i]; + const char *data = dp_packet_data(pkt); + void *l3hdr = (char *)(data + pkt->l3_ofs); + + if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) { + continue; + } + + dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2); + if (dl_type == htons(ETH_TYPE_IP)) { + netdev_refill_l4_cksum(data, pkt, + ((struct ipv4_hdr *)l3hdr)->next_proto_id, + true); + } else if (dl_type == htons(ETH_TYPE_IPV6)) { + netdev_refill_l4_cksum(data, pkt, + ((struct ipv6_hdr *)l3hdr)->proto, + false); + } + + } +} + /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. 
Takes ownership of * 'pkts', even in case of failure. * @@ -1780,6 +1910,11 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) /* We have to do a copy for now */ memcpy(rte_pktmbuf_mtod(pkts[newcnt], void *), dp_packet_data(batch->packets[i]), size); + if (batch->packets[i]->mbuf.ol_flags & PKT_TX_TCP_CKSUM) { + pkts[newcnt]->l2_len = batch->packets[i]->mbuf.l2_len; + pkts[newcnt]->l3_len = batch->packets[i]->mbuf.l3_len; + pkts[newcnt]->ol_flags = batch->packets[i]->mbuf.ol_flags; + } rte_pktmbuf_data_len(pkts[newcnt]) = size; rte_pktmbuf_pkt_len(pkts[newcnt]) = size; @@ -1833,6 +1968,10 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid, return; } + if (dev->hw_ol_features & NETDEV_TX_CHECKSUM_OFFLOAD) { + netdev_prepare_tx_csum(batch->packets, batch->count); + } + if (OVS_UNLIKELY(concurrent_txq)) { qid = qid % dev->up.n_txq; rte_spinlock_lock(&dev->tx_q[qid].tx_lock); @@ -2741,8 +2880,7 @@ netdev_dpdk_vhost_class_init(void) if (ovsthread_once_start(&once)) { rte_vhost_driver_callback_register(&virtio_net_device_ops); rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4 - | 1ULL << VIRTIO_NET_F_HOST_TSO6 - | 1ULL << VIRTIO_NET_F_CSUM); + | 1ULL << VIRTIO_NET_F_HOST_TSO6); ovs_thread_create("vhost_thread", start_vhost_loop, NULL); ovsthread_once_done(&once); diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 883ecd8..08e8d1d 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3480,6 +3480,21 @@ </column> </group> + <group title="TX Checksum Offload Configuration"> + <p> + The checksum calculation for outgoing packets is + performed on the NIC using the TX checksum offload feature. Implemented only + for <code>dpdk + </code> physical interfaces. + </p> + + <column name="options" key="tx-checksum-offload" + type='{"type": "boolean"}'> + Set to <code>false</code> to disable Tx checksum offloading on <code> + dpdk</code> physical ports. By default, Tx checksum offload is enabled. 
+ </column> + </group> + <group title="Common Columns"> The overall purpose of these columns is described under <code>Common Columns</code> at the beginning of this document. -- 1.8.3.1 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
