TCP Segmentation Offload (TSO) significantly improves performance for TCP
workloads in a virtualized environment. Throughput achieved with the
iperf3 tool between VMs on the same host connected to VHU is more than
three times higher with TSO than without it.
Use case description:
VMs are deployed by OpenStack Kilo on VLAN and flat networks.
We are using two OpenStack compute nodes and testing connectivity between
VMs on the same compute node and on different compute nodes. Since the setup
is focused on TCP workloads, performance and connectivity are tested with
the iperf3 tool.
* Test scenarios:
VM to VM on same host
VM to VM on different hosts
VM to VM connected with VLAN network
Changes included in this patch:
* Change of max_packet_len
Previously max_packet_len was calculated based on the MTU; now it is 64K
to handle big TCP frames.
* Update mbuf fields based on DPDK API requirements
update ol_flags if segmentation is needed
update checksum fields
VLAN awareness: update header length
* Enable TSO feature in vhost init function
* Override default txq_flags to enable offloading features in PMD
Concerns:
* Impact of changing max_packet_len to 64K in other use cases
* TSO is enabled by default - do we need an additional feature for turning
TSO off on a per-port or per-VM basis (from Virtualized Infrastructure
Management - e.g. OpenStack)?
I am interested in general design comments and in the concerns listed above.
Co-authored-by: Mark Kavanagh <[email protected]>
Co-authored-by: Yuanhan Liu <[email protected]>
Co-authored-by: Przemyslaw Lal <[email protected]>
Signed-off-by: Marcin Ksiadz <[email protected]>
Signed-off-by: Mark Kavanagh <[email protected]>
Signed-off-by: Przemyslaw Lal <[email protected]>
Signed-off-by: Yuanhan Liu <[email protected]>
---
lib/dp-packet.h | 30 +++++++++++++--------
lib/netdev-dpdk.c | 49 ++++++++++++++++++++++++++++++----
lib/packets.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 143 insertions(+), 16 deletions(-)
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 118c84d..d735c9f 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -24,6 +24,10 @@
#include "util.h"
#include "netdev-dpdk.h"
+#ifdef DPDK_NETDEV
+#include "rte_ether.h"
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -408,18 +412,22 @@ dp_packet_size(const struct dp_packet *b)
static inline void
dp_packet_set_size(struct dp_packet *b, uint32_t v)
{
- /* netdev-dpdk does not currently support segmentation; consequently, for
- * all intents and purposes, 'data_len' (16 bit) and 'pkt_len' (32 bit) may
- * be used interchangably.
- *
- * On the datapath, it is expected that the size of packets
- * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
- * loss of accuracy in assigning 'v' to 'data_len'.
+ /*
+ * Assign current segment length. If total length is greater than
+ * ETHER_MAX_LEN additional calculation is needed
*/
- b->mbuf.data_len = (uint16_t)v; /* Current seg length. */
- b->mbuf.pkt_len = v; /* Total length of all segments linked to
- * this segment. */
-}
+ if (v > ETHER_MAX_LEN) {
+ b->mbuf.data_len =
+ (uint16_t)(b->mbuf.buf_len - b->mbuf.data_off);
+ } else {
+ b->mbuf.data_len = (uint16_t)v;
+ }
+
+ /*
+ * Total length of all segments linked to
+ * this segment.
+ */
+ b->mbuf.pkt_len = v;}
static inline uint16_t
__packet_data(const struct dp_packet *b)
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 19d355f..cce85c2 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -55,8 +55,10 @@
#include "unixctl.h"
#include "rte_config.h"
+#include "rte_ip.h"
#include "rte_mbuf.h"
#include "rte_meter.h"
+#include "rte_tcp.h"
#include "rte_virtio_net.h"
VLOG_DEFINE_THIS_MODULE(dpdk);
@@ -581,6 +583,8 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
n_rxq, int n_txq)
{
int diag = 0;
int i;
+ struct rte_eth_dev_info dev_info;
+ struct rte_eth_txconf *txconf;
/* A device may report more queues than it makes available (this has
* been observed for Intel xl710, which reserves some of them for
@@ -597,9 +601,17 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
n_rxq, int n_txq)
break;
}
+ rte_eth_dev_info_get(dev->port_id, &dev_info);
+ txconf = &dev_info.default_txconf;
+ /*
+ * The default txq_flags disables offloading features. Setting
+ * it to 0 to enable it.
+ */
+ txconf->txq_flags = 0;
+
for (i = 0; i < n_txq; i++) {
diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
- dev->socket_id, NULL);
+ dev->socket_id, txconf);
if (diag) {
VLOG_INFO("Interface %s txq(%d) setup error: %s",
dev->up.name, i, rte_strerror(-diag));
@@ -765,7 +777,13 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int
port_no,
dev->type = type;
dev->flags = 0;
dev->mtu = ETHER_MTU;
- dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
+
+ /* XXX: max_packet_len needs to be set to 64k to accommodate traversal of
+ * oversized packets from the guest across a port, but MTU remains
+ * unchanged - netdev's state is therefore somewhat inconsistent,
+ * as max_packet_len is typically a function of MTU.
+ */
+ dev->max_packet_len = 64 * 1024;
buf_size = dpdk_buf_size(dev->mtu);
dev->dpdk_mp = dpdk_mp_get(dev->socket_id, FRAME_LEN_TO_MTU(buf_size));
@@ -1089,6 +1107,30 @@ dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
struct dpdk_tx_queue *txq = &dev->tx_q[qid];
uint32_t nb_tx = 0;
+ for (int i = 0; i < txq->count; i++) {
+ struct rte_mbuf *m = txq->burst_pkts[i];
+
+ if (m->ol_flags & PKT_TX_TCP_SEG) {
+ void *l3_hdr;
+ struct tcp_hdr *tcp_hdr;
+ struct ether_hdr *eth_hdr;
+
+ eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
+ l3_hdr = (char *)eth_hdr + m->l2_len;
+
+ tcp_hdr = (struct tcp_hdr *)((char*)l3_hdr+ m->l3_len);
+ if (m->ol_flags & PKT_TX_IPV4) {
+ m->ol_flags |= PKT_TX_IP_CKSUM;
+ ((struct ipv4_hdr*)l3_hdr)->hdr_checksum = 0;
+ tcp_hdr->cksum = rte_ipv4_phdr_cksum(
+ (struct ipv4_hdr*)l3_hdr, m->ol_flags);
+ } else {
+ tcp_hdr->cksum = rte_ipv6_phdr_cksum(
+ (struct ipv6_hdr*)l3_hdr, m->ol_flags);
+ }
+ }
+ }
+
while (nb_tx != txq->count) {
uint32_t ret;
@@ -2519,9 +2561,6 @@ static int
dpdk_vhost_class_init(void)
{
rte_vhost_driver_callback_register(&virtio_net_device_ops);
- rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
- | 1ULL << VIRTIO_NET_F_HOST_TSO6
- | 1ULL << VIRTIO_NET_F_CSUM);
ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
return 0;
diff --git a/lib/packets.c b/lib/packets.c
index 43b5a70..864c752 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -12,6 +12,39 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
+ *
+ *
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <config.h>
@@ -33,6 +66,13 @@
#include "dp-packet.h"
#include "unaligned.h"
+#ifdef DPDK_NETDEV
+#include "rte_ip.h"
+#include "rte_mbuf.h"
+#include "rte_tcp.h"
+#include "rte_udp.h"
+#endif
+
const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
const struct in6_addr in6addr_all_hosts = IN6ADDR_ALL_HOSTS_INIT;
@@ -187,6 +227,36 @@ compose_rarp(struct dp_packet *b, const struct eth_addr
eth_src)
dp_packet_set_l3(b, arp);
}
+#ifdef DPDK_NETDEV
+static void
+update_mbuf_data(struct rte_mbuf *m, uint16_t ethertype)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ void *l3_hdr = NULL;
+ void* eth_hdr = NULL;
+
+ eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
+
+ m->l2_len += sizeof(struct vlan_hdr);
+ l3_hdr = (char *)eth_hdr + m->l2_len;
+
+ switch (ethertype) {
+ case ETHER_TYPE_IPv4:
+ ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
+ m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
+ m->ol_flags |= PKT_TX_IPV4;
+ break;
+ case ETHER_TYPE_IPv6:
+ m->l3_len = sizeof(struct ipv6_hdr);
+ m->ol_flags |= PKT_TX_IPV6;
+ break;
+ default:
+ m->l3_len = 0;
+ break;
+ }
+}
+#endif
+
/* Insert VLAN header according to given TCI. Packet passed must be Ethernet
* packet. Ignores the CFI bit of 'tci' using 0 instead.
*
@@ -201,6 +271,11 @@ eth_push_vlan(struct dp_packet *packet, ovs_be16 tpid,
ovs_be16 tci)
memmove(veh, (char *)veh + VLAN_HEADER_LEN, 2 * ETH_ADDR_LEN);
veh->veth_type = tpid;
veh->veth_tci = tci & htons(~VLAN_CFI);
+
+#ifdef DPDK_NETDEV
+ update_mbuf_data((&packet->mbuf), rte_be_to_cpu_16(veh->veth_next_type));
+#endif
+
}
/* Removes outermost VLAN header (if any is present) from 'packet'.
@@ -217,6 +292,11 @@ eth_pop_vlan(struct dp_packet *packet)
memmove((char *)veh + VLAN_HEADER_LEN, veh, 2 * ETH_ADDR_LEN);
dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
+
+#ifdef DPDK_NETDEV
+ packet->mbuf.l2_len -= sizeof(struct vlan_hdr);
+#endif
+
}
}
--
2.8.2
_______________________________________________
dev mailing list
[email protected]
http://openvswitch.org/mailman/listinfo/dev