From: Yi Yang <[email protected]>
We can avoid high system call overhead by using TPACKET_V1/V2/V3
and use DPDK-like poll to receive and send packets (Note: send
still needs to call sendto to trigger final packet transmission).
I can see about 30% improvement compared to the previous recvmmsg
optimization when using TPACKET_V3. TPACKET_V1/V2 is worse than
TPACKET_V3, but it still improves throughput by about 20%.
For veth, throughput is 1.47 Gbps before this patch and about 1.98
Gbps after applying this patch. But it is about 4.00 Gbps if we
use af_packet for veth; the bottleneck lies in the ovs-vswitchd
thread, which handles too many things in every loop (as below),
so it can't work as efficiently as pmd_thread.
memory_run();
bridge_run();
unixctl_server_run(unixctl);
netdev_run();
memory_wait();
bridge_wait();
unixctl_server_wait(unixctl);
netdev_wait();
poll_block();
As a next step, it would be better to let pmd_thread handle
tap and veth interfaces.
Signed-off-by: Yi Yang <[email protected]>
Co-authored-by: William Tu <[email protected]>
Signed-off-by: William Tu <[email protected]>
---
acinclude.m4 | 23 +++
configure.ac | 1 +
lib/netdev-linux-private.h | 27 +++
lib/netdev-linux.c | 481 ++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 527 insertions(+), 5 deletions(-)
diff --git a/acinclude.m4 b/acinclude.m4
index c1470cc..e99aff1 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -1095,6 +1095,29 @@ AC_DEFUN([OVS_CHECK_IF_DL],
AC_SEARCH_LIBS([pcap_open_live], [pcap])
fi])
+dnl OVS_CHECK_LINUX_TPACKET
+dnl
+dnl Configure Linux TPACKET.
+AC_DEFUN([OVS_CHECK_LINUX_TPACKET], [
+ AC_CHECK_HEADER([linux/if_packet.h],
+ [HAVE_TPACKET=yes],
+ [HAVE_TPACKET=no])
+ AM_CONDITIONAL([HAVE_TPACKET], [test "$HAVE_TPACKET" = yes])
+ if test "$HAVE_TPACKET" = yes; then
+ AC_DEFINE([HAVE_TPACKET], [1],
+ [Define to 1 if linux/if_packet.h is available.])
+ OVS_GREP_IFELSE([/usr/include/linux/if_packet.h], [struct tpacket3_hdr ],
+ [AC_DEFINE([HAVE_TPACKET_V3], [1],
+ [Define to 1 if struct tpacket3_hdr is defined])])
+ OVS_GREP_IFELSE([/usr/include/linux/if_packet.h], [struct tpacket2_hdr ],
+ [AC_DEFINE([HAVE_TPACKET_V2], [1],
+ [Define to 1 if struct tpacket2_hdr is defined])])
+ OVS_GREP_IFELSE([/usr/include/linux/if_packet.h], [struct tpacket_hdr ],
+ [AC_DEFINE([HAVE_TPACKET_V1], [1],
+ [Define to 1 if struct tpacket_hdr is defined])])
+ fi
+])
+
dnl Checks for buggy strtok_r.
dnl
dnl Some versions of glibc 2.7 has a bug in strtok_r when compiling
diff --git a/configure.ac b/configure.ac
index 4f483fa..51c288b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -89,6 +89,7 @@ OVS_CHECK_VISUAL_STUDIO_DDK
OVS_CHECK_COVERAGE
OVS_CHECK_NDEBUG
OVS_CHECK_NETLINK
+OVS_CHECK_LINUX_TPACKET
OVS_CHECK_OPENSSL
OVS_CHECK_LIBCAPNG
OVS_CHECK_LOGDIR
diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
index 143616c..e8febfe 100644
--- a/lib/netdev-linux-private.h
+++ b/lib/netdev-linux-private.h
@@ -26,6 +26,9 @@
#include <linux/mii.h>
#include <stdint.h>
#include <stdbool.h>
+#ifdef HAVE_TPACKET
+#include <linux/if_packet.h>
+#endif
#include "dp-packet.h"
#include "netdev-afxdp.h"
@@ -40,6 +43,25 @@ struct netdev;
#define LINUX_RXQ_TSO_MAX_LEN 65536
+#ifdef HAVE_TPACKET
+struct tpacket_ring {
+ int sockfd;
+ struct iovec *rd;
+ uint8_t *mm_space;
+ size_t mm_len, rd_len;
+ struct sockaddr_ll ll;
+ int type, rd_num, flen, version;
+ union {
+ struct tpacket_req req;
+ struct tpacket_req3 req3;
+ };
+ uint32_t block_num;
+ uint32_t frame_num;
+ uint32_t frame_num_in_block;
+ void * ppd;
+};
+#endif /* HAVE_TPACKET */
+
struct netdev_rxq_linux {
struct netdev_rxq up;
bool is_tap;
@@ -103,6 +125,11 @@ struct netdev_linux {
int numa_id; /* NUMA node id. */
+#ifdef HAVE_TPACKET
+ struct tpacket_ring *tp_rx_ring;
+ struct tpacket_ring *tp_tx_ring;
+#endif
+
#ifdef HAVE_AF_XDP
/* AF_XDP information. */
struct xsk_socket_info **xsks;
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 6add3e2..b5becf3 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -48,6 +48,9 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#ifdef HAVE_TPACKET
+#include <sys/mman.h>
+#endif
#include "coverage.h"
#include "dp-packet.h"
@@ -153,6 +156,34 @@ struct tpacket_auxdata {
uint16_t tp_vlan_tpid;
};
+#ifdef HAVE_TPACKET /* All the definitions for TPACKET */
+#ifndef __aligned_tpacket
+# define __aligned_tpacket __attribute__((aligned(TPACKET_ALIGNMENT)))
+#endif
+
+#ifndef __align_tpacket
+# define __align_tpacket(x) __attribute__((aligned(TPACKET_ALIGN(x))))
+#endif
+
+struct block_desc {
+ uint32_t version;
+ uint32_t offset_to_priv;
+ struct tpacket_hdr_v1 h1;
+};
+
+union frame_map {
+ struct {
+ struct tpacket_hdr tp_h __aligned_tpacket;
+ struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket_hdr));
+ } *v1;
+ struct {
+ struct tpacket2_hdr tp_h __aligned_tpacket;
+ struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket2_hdr));
+ } *v2;
+ void *raw;
+};
+#endif /* HAVE_TPACKET */
+
/* Linux 2.6.27 introduced ethtool_cmd_speed
*
* To avoid revisiting problems reported with using configure to detect
@@ -1064,6 +1095,141 @@ netdev_linux_rxq_alloc(void)
return &rx->up;
}
+#ifdef HAVE_TPACKET
+static inline int
+tpacket_set_packet_loss_discard(int sock)
+{
+ int discard = 1;
+
+ return setsockopt(sock, SOL_PACKET, PACKET_LOSS, (void *) &discard,
+ sizeof(discard));
+}
+
+static inline void *
+tpacket_get_next_frame(struct tpacket_ring *ring, uint32_t frame_num)
+{
+#ifdef HAVE_TPACKET_V3
+ uint8_t *f0 = ring->rd[0].iov_base;
+
+ return f0 + (frame_num * ring->req3.tp_frame_size);
+#else
+ return ring->rd[frame_num].iov_base;
+#endif
+}
+
+/*
+ * For TPACKET_V1&V2, ring->rd_num is tp_frame_nr, ring->flen is tp_frame_size
+ */
+static inline void
+tpacket_v1_v2_fill_ring(struct tpacket_ring *ring, unsigned int blocks)
+{
+ ring->req.tp_block_size = getpagesize() << 2;
+ ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7;
+ ring->req.tp_block_nr = blocks;
+
+ ring->req.tp_frame_nr = ring->req.tp_block_size /
+ ring->req.tp_frame_size *
+ ring->req.tp_block_nr;
+
+ ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr;
+ ring->rd_num = ring->req.tp_frame_nr;
+ ring->flen = ring->req.tp_frame_size;
+}
+
+/*
+ * For TPACKET_V3, ring->rd_num is tp_block_nr, ring->flen is tp_block_size
+ */
+static inline void
+tpacket_v3_fill_ring(struct tpacket_ring *ring, unsigned int blocks, int type)
+{
+ if (type == PACKET_RX_RING) {
+ ring->req3.tp_retire_blk_tov = 0;
+ ring->req3.tp_sizeof_priv = 0;
+ ring->req3.tp_feature_req_word = 0;
+ }
+ ring->req3.tp_block_size = getpagesize() << 2;
+ ring->req3.tp_frame_size = TPACKET_ALIGNMENT << 7;
+ ring->req3.tp_block_nr = blocks;
+
+ ring->req3.tp_frame_nr = ring->req3.tp_block_size /
+ ring->req3.tp_frame_size *
+ ring->req3.tp_block_nr;
+
+ ring->mm_len = ring->req3.tp_block_size * ring->req3.tp_block_nr;
+ ring->rd_num = ring->req3.tp_block_nr;
+ ring->flen = ring->req3.tp_block_size;
+}
+
+static int
+tpacket_setup_ring(int sock, struct tpacket_ring *ring, int version, int type)
+{
+ int ret = 0;
+ unsigned int blocks = 256;
+
+ ring->type = type;
+ ring->version = version;
+
+ switch (version) {
+ case TPACKET_V1:
+ case TPACKET_V2:
+ if (type == PACKET_TX_RING) {
+ tpacket_set_packet_loss_discard(sock);
+ }
+ tpacket_v1_v2_fill_ring(ring, blocks);
+ ret = setsockopt(sock, SOL_PACKET, type, &ring->req,
+ sizeof(ring->req));
+ break;
+
+ case TPACKET_V3:
+ tpacket_v3_fill_ring(ring, blocks, type);
+ ret = setsockopt(sock, SOL_PACKET, type, &ring->req3,
+ sizeof(ring->req3));
+ break;
+ }
+
+ if (ret == -1) {
+ return -1;
+ }
+
+ ring->rd_len = ring->rd_num * sizeof(*ring->rd);
+ ring->rd = xmalloc(ring->rd_len);
+ if (ring->rd == NULL) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int
+tpacket_mmap_rx_tx_ring(int sock, struct tpacket_ring *rx_ring,
+ struct tpacket_ring *tx_ring)
+{
+ int i;
+
+ rx_ring->mm_space = mmap(0, rx_ring->mm_len + tx_ring->mm_len,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0);
+ if (rx_ring->mm_space == MAP_FAILED) {
+ return -1;
+ }
+
+ memset(rx_ring->rd, 0, rx_ring->rd_len);
+ for (i = 0; i < rx_ring->rd_num; ++i) {
+ rx_ring->rd[i].iov_base = rx_ring->mm_space + (i * rx_ring->flen);
+ rx_ring->rd[i].iov_len = rx_ring->flen;
+ }
+
+ tx_ring->mm_space = rx_ring->mm_space + rx_ring->mm_len;
+ memset(tx_ring->rd, 0, tx_ring->rd_len);
+ for (i = 0; i < tx_ring->rd_num; ++i) {
+ tx_ring->rd[i].iov_base = tx_ring->mm_space + (i * tx_ring->flen);
+ tx_ring->rd[i].iov_len = tx_ring->flen;
+ }
+
+ return 0;
+}
+#endif
+
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
@@ -1079,6 +1245,15 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
} else {
struct sockaddr_ll sll;
int ifindex, val;
+#ifdef HAVE_TPACKET
+#ifdef HAVE_TPACKET_V3
+ int ver = TPACKET_V3;
+#elif defined(HAVE_TPACKET_V2)
+ int ver = TPACKET_V2;
+#else
+ int ver = TPACKET_V1;
+#endif
+#endif
/* Result of tcpdump -dd inbound */
static const struct sock_filter filt[] = {
{ 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
@@ -1091,13 +1266,52 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
};
/* Create file descriptor. */
- rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
+ rx->fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
if (rx->fd < 0) {
error = errno;
VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
goto error;
}
+#ifdef HAVE_TPACKET
+ error = setsockopt(rx->fd, SOL_PACKET, PACKET_VERSION, &ver,
+ sizeof(ver));
+ if (error != 0) {
+ error = errno;
+ VLOG_ERR("%s: failed to set tpacket version (%s)",
+ netdev_get_name(netdev_), ovs_strerror(error));
+ goto error;
+ }
+ netdev->tp_rx_ring = xzalloc(sizeof(struct tpacket_ring));
+ netdev->tp_tx_ring = xzalloc(sizeof(struct tpacket_ring));
+ netdev->tp_rx_ring->sockfd = rx->fd;
+ netdev->tp_tx_ring->sockfd = rx->fd;
+ error = tpacket_setup_ring(rx->fd, netdev->tp_rx_ring, ver,
+ PACKET_RX_RING);
+ if (error != 0) {
+ error = errno;
+ VLOG_ERR("%s: failed to set tpacket rx ring (%s)",
+ netdev_get_name(netdev_), ovs_strerror(error));
+ goto error;
+ }
+ error = tpacket_setup_ring(rx->fd, netdev->tp_tx_ring, ver,
+ PACKET_TX_RING);
+ if (error != 0) {
+ error = errno;
+ VLOG_ERR("%s: failed to set tpacket tx ring (%s)",
+ netdev_get_name(netdev_), ovs_strerror(error));
+ goto error;
+ }
+ error = tpacket_mmap_rx_tx_ring(rx->fd, netdev->tp_rx_ring,
+ netdev->tp_tx_ring);
+ if (error != 0) {
+ error = errno;
+ VLOG_ERR("%s: failed to mmap tpacket rx & tx ring (%s)",
+ netdev_get_name(netdev_), ovs_strerror(error));
+ goto error;
+ }
+#endif
+
val = 1;
if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
error = errno;
@@ -1129,7 +1343,12 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
/* Bind to specific ethernet device. */
memset(&sll, 0, sizeof sll);
- sll.sll_family = AF_PACKET;
+ sll.sll_family = PF_PACKET;
+#ifdef HAVE_TPACKET
+ sll.sll_hatype = 0;
+ sll.sll_pkttype = 0;
+ sll.sll_halen = 0;
+#endif
sll.sll_ifindex = ifindex;
sll.sll_protocol = htons(ETH_P_ALL);
if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
@@ -1168,6 +1387,17 @@ netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
int i;
if (!rx->is_tap) {
+#ifdef HAVE_TPACKET
+ struct netdev_linux *netdev = netdev_linux_cast(rx->up.netdev);
+
+ if (netdev->tp_rx_ring) {
+ munmap(netdev->tp_rx_ring->mm_space,
+ 2 * netdev->tp_rx_ring->mm_len);
+ free(netdev->tp_rx_ring->rd);
+ free(netdev->tp_tx_ring->rd);
+ }
+#endif
+
close(rx->fd);
}
@@ -1184,6 +1414,7 @@ netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
free(rx);
}
+#ifndef HAVE_TPACKET
static ovs_be16
auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
{
@@ -1345,6 +1576,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux
*rx, int mtu,
return 0;
}
+#else /* ifdef HAVE_TPACKET */
/*
* Receive packets from tap by batch process for better performance,
* it can receive NETDEV_MAX_BURST packets at most once, the received
@@ -1428,6 +1660,125 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux
*rx, int mtu,
return 0;
}
+static int
+netdev_linux_batch_recv_tpacket(struct netdev_rxq *rxq_, int mtu,
+ struct dp_packet_batch *batch)
+{
+ struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
+ struct netdev_linux *netdev = netdev_linux_cast(rx->up.netdev);
+ struct dp_packet *buffer;
+ int i = 0;
+
+#ifdef HAVE_TPACKET_V3
+ unsigned int block_num;
+ unsigned int fn_in_block;
+ struct block_desc *pbd;
+ struct tpacket3_hdr *ppd;
+
+ ppd = (struct tpacket3_hdr *)netdev->tp_rx_ring->ppd;
+ block_num = netdev->tp_rx_ring->block_num;
+ fn_in_block = netdev->tp_rx_ring->frame_num_in_block;
+ pbd = (struct block_desc *) netdev->tp_rx_ring->rd[block_num].iov_base;
+#else
+#if defined(HAVE_TPACKET_V2)
+ struct tpacket2_hdr *ppd;
+#else
+ struct tpacket_hdr *ppd;
+#endif
+ unsigned int frame_num;
+ unsigned int frame_nr = netdev->tp_rx_ring->rd_num;
+
+ frame_num = netdev->tp_rx_ring->frame_num;
+#endif
+
+ while (i < NETDEV_MAX_BURST) {
+#ifdef HAVE_TPACKET_V3
+ if ((pbd->h1.block_status & TP_STATUS_USER) == 0) {
+ break;
+ }
+ if (fn_in_block == 0) {
+ ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd +
+ pbd->h1.offset_to_first_pkt);
+ }
+#elif defined(HAVE_TPACKET_V2)
+ ppd = (struct tpacket2_hdr *)
+ netdev->tp_rx_ring->rd[frame_num].iov_base;
+ if ((ppd->tp_status & TP_STATUS_USER) == 0) {
+ break;
+ }
+#else
+ ppd = (struct tpacket_hdr *)netdev->tp_rx_ring->rd[frame_num].iov_base;
+ if ((ppd->tp_status & TP_STATUS_USER) == 0) {
+ break;
+ }
+#endif
+
+ buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
+ DP_NETDEV_HEADROOM);
+ memcpy(dp_packet_data(buffer),
+ (uint8_t *) ppd + ppd->tp_mac, ppd->tp_snaplen);
+ dp_packet_set_size(buffer,
+ dp_packet_size(buffer) + ppd->tp_snaplen);
+#if defined(HAVE_TPACKET_V2) || defined(HAVE_TPACKET_V3)
+ if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
+ struct eth_header *eth;
+ bool double_tagged;
+ ovs_be16 vlan_tpid;
+
+ eth = dp_packet_data(buffer);
+ double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
+ if (ppd->tp_status & TP_STATUS_VLAN_TPID_VALID) {
+#ifdef HAVE_TPACKET_V3
+ vlan_tpid = htons(ppd->hv1.tp_vlan_tpid);
+#else
+ vlan_tpid = htons(ppd->tp_vlan_tpid);
+#endif
+ } else if (double_tagged) {
+ vlan_tpid = htons(ETH_TYPE_VLAN_8021AD);
+ } else {
+ vlan_tpid = htons(ETH_TYPE_VLAN_8021Q);
+ }
+#ifdef HAVE_TPACKET_V3
+ eth_push_vlan(buffer, vlan_tpid, htons(ppd->hv1.tp_vlan_tci));
+#else
+ eth_push_vlan(buffer, vlan_tpid, htons(ppd->tp_vlan_tci));
+#endif
+ }
+#endif
+ dp_packet_batch_add(batch, buffer);
+
+#ifdef HAVE_TPACKET_V3
+ fn_in_block++;
+ if (fn_in_block >= pbd->h1.num_pkts) {
+ pbd->h1.block_status = TP_STATUS_KERNEL;
+ block_num = (block_num + 1) %
+ netdev->tp_rx_ring->req3.tp_block_nr;
+ pbd = (struct block_desc *)
+ netdev->tp_rx_ring->rd[block_num].iov_base;
+ fn_in_block = 0;
+ ppd = NULL;
+ } else {
+ ppd = (struct tpacket3_hdr *)
+ ((uint8_t *) ppd + ppd->tp_next_offset);
+ }
+#else
+ ppd->tp_status = TP_STATUS_KERNEL;
+ frame_num = (frame_num + 1) % frame_nr;
+#endif
+ i++;
+ }
+
+#ifdef HAVE_TPACKET_V3
+ netdev->tp_rx_ring->block_num = block_num;
+ netdev->tp_rx_ring->frame_num_in_block = fn_in_block;
+ netdev->tp_rx_ring->ppd = ppd;
+#else
+ netdev->tp_rx_ring->frame_num = frame_num;
+#endif
+
+ return 0;
+}
+#endif
static int
netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
@@ -1443,9 +1794,15 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct
dp_packet_batch *batch,
}
dp_packet_batch_init(batch);
- retval = (rx->is_tap
- ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
- : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
+ if (rx->is_tap) {
+ retval = netdev_linux_batch_rxq_recv_tap(rx, mtu, batch);
+ } else {
+#ifndef HAVE_TPACKET
+ retval = netdev_linux_batch_rxq_recv_sock(rx, mtu, batch);
+#else
+ retval = netdev_linux_batch_recv_tpacket(rxq_, mtu, batch);
+#endif
+ }
if (retval) {
if (retval != EAGAIN && retval != EMSGSIZE) {
@@ -1486,6 +1843,7 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
}
}
+#ifndef HAVE_TPACKET
static int
netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu,
struct dp_packet_batch *batch)
@@ -1531,6 +1889,7 @@ netdev_linux_sock_batch_send(int sock, int ifindex, bool
tso, int mtu,
return error;
}
+#else /* ifdef HAVE_TPACKET */
/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
* essential, because packets sent to a tap device with an AF_PACKET socket
* will loop back to be *received* again on the tap device. This doesn't occur
@@ -1650,6 +2009,114 @@ netdev_linux_get_numa_id(const struct netdev *netdev_)
return numa_id;
}
+static inline int
+tpacket_tx_is_ready(void * next_frame)
+{
+#ifdef HAVE_TPACKET_V3
+    struct tpacket3_hdr *hdr = (struct tpacket3_hdr *)next_frame;
+#elif defined(HAVE_TPACKET_V2)
+ struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)next_frame;
+#else
+ struct tpacket_hdr *hdr = (struct tpacket_hdr *)next_frame;
+#endif
+ return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
+}
+
+static int
+netdev_linux_tpacket_batch_send(struct netdev *netdev_,
+ struct dp_packet_batch *batch)
+{
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ struct dp_packet *packet;
+ int sockfd;
+ ssize_t bytes_sent;
+ int total_pkts = 0;
+
+#ifdef HAVE_TPACKET_V3
+ unsigned int frame_nr = netdev->tp_tx_ring->req3.tp_frame_nr;
+#else
+ unsigned int frame_nr = netdev->tp_tx_ring->rd_num;
+#endif
+ unsigned int frame_num = netdev->tp_tx_ring->frame_num;
+
+ /* The Linux tap driver returns EIO if the device is not up,
+ * so if the device is not up, don't waste time sending it.
+ * However, if the device is in another network namespace
+ * then OVS can't retrieve the state. In that case, send the
+ * packets anyway. */
+ if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
+ netdev->tx_dropped += dp_packet_batch_size(batch);
+ return 0;
+ }
+
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ union frame_map ppd;
+ size_t size = dp_packet_size(packet);
+#ifdef HAVE_TPACKET_V3
+ struct tpacket3_hdr *next_frame
+ = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num);
+#elif defined(HAVE_TPACKET_V2)
+ struct tpacket2_hdr *next_frame
+ = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num);
+#else
+ struct tpacket_hdr *next_frame
+ = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num);
+#endif
+
+ ppd.raw = next_frame;
+ if (!tpacket_tx_is_ready(next_frame)) {
+ break;
+ }
+#ifdef HAVE_TPACKET_V3
+ next_frame->tp_snaplen = size;
+ next_frame->tp_len = size;
+ next_frame->tp_next_offset = 0;
+
+ memcpy((uint8_t *)ppd.raw + TPACKET3_HDRLEN
+ - sizeof(struct sockaddr_ll),
+ dp_packet_data(packet),
+ size);
+#elif defined(HAVE_TPACKET_V2)
+ ppd.v2->tp_h.tp_snaplen = size;
+ ppd.v2->tp_h.tp_len = size;
+
+ memcpy((uint8_t *)ppd.raw + TPACKET2_HDRLEN
+ - sizeof(struct sockaddr_ll),
+ dp_packet_data(packet),
+ size);
+#else
+ ppd.v1->tp_h.tp_snaplen = size;
+ ppd.v1->tp_h.tp_len = size;
+
+ memcpy((uint8_t *)ppd.raw + TPACKET_HDRLEN
+ - sizeof(struct sockaddr_ll),
+ dp_packet_data(packet),
+ size);
+#endif
+ next_frame->tp_status = TP_STATUS_SEND_REQUEST;
+ frame_num = (frame_num + 1) % frame_nr;
+ total_pkts++;
+ }
+ netdev->tp_tx_ring->frame_num = frame_num;
+
+ /* kick-off transmits */
+ if (total_pkts != 0) {
+ sockfd = netdev->tp_tx_ring->sockfd;
+ bytes_sent = sendto(sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+ if (bytes_sent == -1 &&
+ errno != ENOBUFS && errno != EAGAIN) {
+ /*
+ * In case of an ENOBUFS/EAGAIN error all of the enqueued
+ * packets will be considered successful even though only some
+ * are sent.
+ */
+ netdev->tx_dropped += dp_packet_batch_size(batch);
+ }
+ }
+ return 0;
+}
+#endif
+
/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
* errno value. Returns EAGAIN without blocking if the packet cannot be queued
* immediately. Returns EMSGSIZE if a partial packet was transmitted or if
@@ -1689,7 +2156,11 @@ netdev_linux_send(struct netdev *netdev_, int qid
OVS_UNUSED,
goto free_batch;
}
+#ifndef HAVE_TPACKET
error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch);
+#else
+ error = netdev_linux_tpacket_batch_send(netdev_, batch);
+#endif
} else {
error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
}
--
1.8.3.1
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev