On Mon, Jan 20, 2020 at 11:50 PM <[email protected]> wrote: > > From: Yi Yang <[email protected]> > > We can avoid high system call overhead by using TPACKET_V1/V2/V3 > and use DPDK-like polling to receive and send packets (Note: send > still needs to call sendto to trigger the final packet transmission). > > I can see about 30% improvement compared to the last recvmmsg > optimization if I use TPACKET_V3. TPACKET_V1/V2 is worse than > TPACKET_V3, but it can still improve by about 20%. > > For veth, it is 1.47 Gbps before this patch and about 1.98 > Gbps after applying this patch. But it is about 4.00 Gbps if we > use af_packet for veth; the bottleneck lies in the ovs-vswitchd
Hi Yiyang, I don't understand these three numbers. Don't you also use af_packet for veth for 1.47 Gbps and 1.98 Gbps? What's the difference between your 4.00 Gbps and 1.98 Gbps? William > thread, it will handle too many things for every loop (as below) > , so it can't work as efficiently as pmd_thread. > > memory_run(); > bridge_run(); > unixctl_server_run(unixctl); > netdev_run(); > > memory_wait(); > bridge_wait(); > unixctl_server_wait(unixctl); > netdev_wait(); > poll_block(); > > In the next step, it will be better to let pmd_thread handle > tap and veth interfaces. > > Signed-off-by: Yi Yang <[email protected]> > Co-authored-by: William Tu <[email protected]> > Signed-off-by: William Tu <[email protected]> > --- > acinclude.m4 | 23 +++ > configure.ac | 1 + > lib/netdev-linux-private.h | 27 +++ > lib/netdev-linux.c | 481 > ++++++++++++++++++++++++++++++++++++++++++++- > 4 files changed, 527 insertions(+), 5 deletions(-) > > diff --git a/acinclude.m4 b/acinclude.m4 > index c1470cc..e99aff1 100644 > --- a/acinclude.m4 > +++ b/acinclude.m4 > @@ -1095,6 +1095,29 @@ AC_DEFUN([OVS_CHECK_IF_DL], > AC_SEARCH_LIBS([pcap_open_live], [pcap]) > fi]) > > +dnl OVS_CHECK_LINUX_TPACKET > +dnl > +dnl Configure Linux TPACKET. 
> +AC_DEFUN([OVS_CHECK_LINUX_TPACKET], [ > + AC_CHECK_HEADER([linux/if_packet.h], > + [HAVE_TPACKET=yes], > + [HAVE_TPACKET=no]) > + AM_CONDITIONAL([HAVE_TPACKET], [test "$HAVE_TPACKET" = yes]) > + if test "$HAVE_TPACKET" = yes; then > + AC_DEFINE([HAVE_TPACKET], [1], > + [Define to 1 if linux/if_packet.h is available.]) > + OVS_GREP_IFELSE([/usr/include/linux/if_packet.h], [struct tpacket3_hdr ], > + [AC_DEFINE([HAVE_TPACKET_V3], [1], > + [Define to 1 if struct tpacket3_hdr is defined])]) > + OVS_GREP_IFELSE([/usr/include/linux/if_packet.h], [struct tpacket2_hdr ], > + [AC_DEFINE([HAVE_TPACKET_V2], [1], > + [Define to 1 if struct tpacket2_hdr is defined])]) > + OVS_GREP_IFELSE([/usr/include/linux/if_packet.h], [struct tpacket_hdr ], > + [AC_DEFINE([HAVE_TPACKET_V1], [1], > + [Define to 1 if struct tpacket_hdr is defined])]) > + fi > +]) > + > dnl Checks for buggy strtok_r. > dnl > dnl Some versions of glibc 2.7 has a bug in strtok_r when compiling > diff --git a/configure.ac b/configure.ac > index 4f483fa..51c288b 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -89,6 +89,7 @@ OVS_CHECK_VISUAL_STUDIO_DDK > OVS_CHECK_COVERAGE > OVS_CHECK_NDEBUG > OVS_CHECK_NETLINK > +OVS_CHECK_LINUX_TPACKET > OVS_CHECK_OPENSSL > OVS_CHECK_LIBCAPNG > OVS_CHECK_LOGDIR > diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h > index 143616c..e8febfe 100644 > --- a/lib/netdev-linux-private.h > +++ b/lib/netdev-linux-private.h > @@ -26,6 +26,9 @@ > #include <linux/mii.h> > #include <stdint.h> > #include <stdbool.h> > +#ifdef HAVE_TPACKET > +#include <linux/if_packet.h> > +#endif > > #include "dp-packet.h" > #include "netdev-afxdp.h" > @@ -40,6 +43,25 @@ struct netdev; > > #define LINUX_RXQ_TSO_MAX_LEN 65536 > > +#ifdef HAVE_TPACKET > +struct tpacket_ring { > + int sockfd; > + struct iovec *rd; > + uint8_t *mm_space; > + size_t mm_len, rd_len; > + struct sockaddr_ll ll; > + int type, rd_num, flen, version; > + union { > + struct tpacket_req req; > + struct tpacket_req3 
req3; > + }; > + uint32_t block_num; > + uint32_t frame_num; > + uint32_t frame_num_in_block; > + void * ppd; > +}; > +#endif /* HAVE_TPACKET */ > + > struct netdev_rxq_linux { > struct netdev_rxq up; > bool is_tap; > @@ -103,6 +125,11 @@ struct netdev_linux { > > int numa_id; /* NUMA node id. */ > > +#ifdef HAVE_TPACKET > + struct tpacket_ring *tp_rx_ring; > + struct tpacket_ring *tp_tx_ring; > +#endif > + > #ifdef HAVE_AF_XDP > /* AF_XDP information. */ > struct xsk_socket_info **xsks; > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c > index 6add3e2..b5becf3 100644 > --- a/lib/netdev-linux.c > +++ b/lib/netdev-linux.c > @@ -48,6 +48,9 @@ > #include <stdlib.h> > #include <string.h> > #include <unistd.h> > +#ifdef HAVE_TPACKET > +#include <sys/mman.h> > +#endif > > #include "coverage.h" > #include "dp-packet.h" > @@ -153,6 +156,34 @@ struct tpacket_auxdata { > uint16_t tp_vlan_tpid; > }; > > +#ifdef HAVE_TPACKET /* All the definitions for TPACKET */ > +#ifndef __aligned_tpacket > +# define __aligned_tpacket __attribute__((aligned(TPACKET_ALIGNMENT))) > +#endif > + > +#ifndef __align_tpacket > +# define __align_tpacket(x) __attribute__((aligned(TPACKET_ALIGN(x)))) > +#endif > + > +struct block_desc { > + uint32_t version; > + uint32_t offset_to_priv; > + struct tpacket_hdr_v1 h1; > +}; > + > +union frame_map { > + struct { > + struct tpacket_hdr tp_h __aligned_tpacket; > + struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket_hdr)); > + } *v1; > + struct { > + struct tpacket2_hdr tp_h __aligned_tpacket; > + struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket2_hdr)); > + } *v2; > + void *raw; > +}; > +#endif /* HAVE_TPACKET */ > + > /* Linux 2.6.27 introduced ethtool_cmd_speed > * > * To avoid revisiting problems reported with using configure to detect > @@ -1064,6 +1095,141 @@ netdev_linux_rxq_alloc(void) > return &rx->up; > } > > +#ifdef HAVE_TPACKET > +static inline int > +tpacket_set_packet_loss_discard(int sock) > +{ > + int discard = 1; 
> + > + return setsockopt(sock, SOL_PACKET, PACKET_LOSS, (void *) &discard, > + sizeof(discard)); > +} > + > +static inline void * > +tpacket_get_next_frame(struct tpacket_ring *ring, uint32_t frame_num) > +{ > +#ifdef HAVE_TPACKET_V3 > + uint8_t *f0 = ring->rd[0].iov_base; > + > + return f0 + (frame_num * ring->req3.tp_frame_size); > +#else > + return ring->rd[frame_num].iov_base; > +#endif > +} > + > +/* > + * For TPACKET_V1&V2, ring->rd_num is tp_frame_nr, ring->flen is > tp_frame_size > + */ > +static inline void > +tpacket_v1_v2_fill_ring(struct tpacket_ring *ring, unsigned int blocks) > +{ > + ring->req.tp_block_size = getpagesize() << 2; > + ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7; > + ring->req.tp_block_nr = blocks; > + > + ring->req.tp_frame_nr = ring->req.tp_block_size / > + ring->req.tp_frame_size * > + ring->req.tp_block_nr; > + > + ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr; > + ring->rd_num = ring->req.tp_frame_nr; > + ring->flen = ring->req.tp_frame_size; > +} > + > +/* > + * For TPACKET_V3, ring->rd_num is tp_block_nr, ring->flen is tp_block_size > + */ > +static inline void > +tpacket_v3_fill_ring(struct tpacket_ring *ring, unsigned int blocks, int > type) > +{ > + if (type == PACKET_RX_RING) { > + ring->req3.tp_retire_blk_tov = 0; > + ring->req3.tp_sizeof_priv = 0; > + ring->req3.tp_feature_req_word = 0; > + } > + ring->req3.tp_block_size = getpagesize() << 2; > + ring->req3.tp_frame_size = TPACKET_ALIGNMENT << 7; > + ring->req3.tp_block_nr = blocks; > + > + ring->req3.tp_frame_nr = ring->req3.tp_block_size / > + ring->req3.tp_frame_size * > + ring->req3.tp_block_nr; > + > + ring->mm_len = ring->req3.tp_block_size * ring->req3.tp_block_nr; > + ring->rd_num = ring->req3.tp_block_nr; > + ring->flen = ring->req3.tp_block_size; > +} > + > +static int > +tpacket_setup_ring(int sock, struct tpacket_ring *ring, int version, int > type) > +{ > + int ret = 0; > + unsigned int blocks = 256; > + > + ring->type = type; > + 
ring->version = version; > + > + switch (version) { > + case TPACKET_V1: > + case TPACKET_V2: > + if (type == PACKET_TX_RING) { > + tpacket_set_packet_loss_discard(sock); > + } > + tpacket_v1_v2_fill_ring(ring, blocks); > + ret = setsockopt(sock, SOL_PACKET, type, &ring->req, > + sizeof(ring->req)); > + break; > + > + case TPACKET_V3: > + tpacket_v3_fill_ring(ring, blocks, type); > + ret = setsockopt(sock, SOL_PACKET, type, &ring->req3, > + sizeof(ring->req3)); > + break; > + } > + > + if (ret == -1) { > + return -1; > + } > + > + ring->rd_len = ring->rd_num * sizeof(*ring->rd); > + ring->rd = xmalloc(ring->rd_len); > + if (ring->rd == NULL) { > + return -1; > + } > + > + return 0; > +} > + > +static inline int > +tpacket_mmap_rx_tx_ring(int sock, struct tpacket_ring *rx_ring, > + struct tpacket_ring *tx_ring) > +{ > + int i; > + > + rx_ring->mm_space = mmap(0, rx_ring->mm_len + tx_ring->mm_len, > + PROT_READ | PROT_WRITE, > + MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0); > + if (rx_ring->mm_space == MAP_FAILED) { > + return -1; > + } > + > + memset(rx_ring->rd, 0, rx_ring->rd_len); > + for (i = 0; i < rx_ring->rd_num; ++i) { > + rx_ring->rd[i].iov_base = rx_ring->mm_space + (i * > rx_ring->flen); > + rx_ring->rd[i].iov_len = rx_ring->flen; > + } > + > + tx_ring->mm_space = rx_ring->mm_space + rx_ring->mm_len; > + memset(tx_ring->rd, 0, tx_ring->rd_len); > + for (i = 0; i < tx_ring->rd_num; ++i) { > + tx_ring->rd[i].iov_base = tx_ring->mm_space + (i * > tx_ring->flen); > + tx_ring->rd[i].iov_len = tx_ring->flen; > + } > + > + return 0; > +} > +#endif > + > static int > netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > { > @@ -1079,6 +1245,15 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > } else { > struct sockaddr_ll sll; > int ifindex, val; > +#ifdef HAVE_TPACKET > +#ifdef HAVE_TPACKET_V3 > + int ver = TPACKET_V3; > +#elif defined(HAVE_TPACKET_V2) > + int ver = TPACKET_V2; > +#else > + int ver = TPACKET_V1; > +#endif > +#endif > /* Result of 
tcpdump -dd inbound */ > static const struct sock_filter filt[] = { > { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */ > @@ -1091,13 +1266,52 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > }; > > /* Create file descriptor. */ > - rx->fd = socket(PF_PACKET, SOCK_RAW, 0); > + rx->fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); > if (rx->fd < 0) { > error = errno; > VLOG_ERR("failed to create raw socket (%s)", > ovs_strerror(error)); > goto error; > } > > +#ifdef HAVE_TPACKET > + error = setsockopt(rx->fd, SOL_PACKET, PACKET_VERSION, &ver, > + sizeof(ver)); > + if (error != 0) { > + error = errno; > + VLOG_ERR("%s: failed to set tpacket version (%s)", > + netdev_get_name(netdev_), ovs_strerror(error)); > + goto error; > + } > + netdev->tp_rx_ring = xzalloc(sizeof(struct tpacket_ring)); > + netdev->tp_tx_ring = xzalloc(sizeof(struct tpacket_ring)); > + netdev->tp_rx_ring->sockfd = rx->fd; > + netdev->tp_tx_ring->sockfd = rx->fd; > + error = tpacket_setup_ring(rx->fd, netdev->tp_rx_ring, ver, > + PACKET_RX_RING); > + if (error != 0) { > + error = errno; > + VLOG_ERR("%s: failed to set tpacket rx ring (%s)", > + netdev_get_name(netdev_), ovs_strerror(error)); > + goto error; > + } > + error = tpacket_setup_ring(rx->fd, netdev->tp_tx_ring, ver, > + PACKET_TX_RING); > + if (error != 0) { > + error = errno; > + VLOG_ERR("%s: failed to set tpacket tx ring (%s)", > + netdev_get_name(netdev_), ovs_strerror(error)); > + goto error; > + } > + error = tpacket_mmap_rx_tx_ring(rx->fd, netdev->tp_rx_ring, > + netdev->tp_tx_ring); > + if (error != 0) { > + error = errno; > + VLOG_ERR("%s: failed to mmap tpacket rx & tx ring (%s)", > + netdev_get_name(netdev_), ovs_strerror(error)); > + goto error; > + } > +#endif > + > val = 1; > if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof > val)) { > error = errno; > @@ -1129,7 +1343,12 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > > /* Bind to specific ethernet device. 
*/ > memset(&sll, 0, sizeof sll); > - sll.sll_family = AF_PACKET; > + sll.sll_family = PF_PACKET; > +#ifdef HAVE_TPACKET > + sll.sll_hatype = 0; > + sll.sll_pkttype = 0; > + sll.sll_halen = 0; > +#endif > sll.sll_ifindex = ifindex; > sll.sll_protocol = htons(ETH_P_ALL); > if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) { > @@ -1168,6 +1387,17 @@ netdev_linux_rxq_destruct(struct netdev_rxq *rxq_) > int i; > > if (!rx->is_tap) { > +#ifdef HAVE_TPACKET > + struct netdev_linux *netdev = netdev_linux_cast(rx->up.netdev); > + > + if (netdev->tp_rx_ring) { > + munmap(netdev->tp_rx_ring->mm_space, > + 2 * netdev->tp_rx_ring->mm_len); > + free(netdev->tp_rx_ring->rd); > + free(netdev->tp_tx_ring->rd); > + } > +#endif > + > close(rx->fd); > } > > @@ -1184,6 +1414,7 @@ netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_) > free(rx); > } > > +#ifndef HAVE_TPACKET > static ovs_be16 > auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged) > { > @@ -1345,6 +1576,7 @@ netdev_linux_batch_rxq_recv_sock(struct > netdev_rxq_linux *rx, int mtu, > return 0; > } > > +#else /* ifdef HAVE_TPACKET */ > /* > * Receive packets from tap by batch process for better performance, > * it can receive NETDEV_MAX_BURST packets at most once, the received > @@ -1428,6 +1660,125 @@ netdev_linux_batch_rxq_recv_tap(struct > netdev_rxq_linux *rx, int mtu, > > return 0; > } > +static int > +netdev_linux_batch_recv_tpacket(struct netdev_rxq *rxq_, int mtu, > + struct dp_packet_batch *batch) > +{ > + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); > + struct netdev_linux *netdev = netdev_linux_cast(rx->up.netdev); > + struct dp_packet *buffer; > + int i = 0; > + > +#ifdef HAVE_TPACKET_V3 > + unsigned int block_num; > + unsigned int fn_in_block; > + struct block_desc *pbd; > + struct tpacket3_hdr *ppd; > + > + ppd = (struct tpacket3_hdr *)netdev->tp_rx_ring->ppd; > + block_num = netdev->tp_rx_ring->block_num; > + fn_in_block = netdev->tp_rx_ring->frame_num_in_block; > + 
pbd = (struct block_desc *) netdev->tp_rx_ring->rd[block_num].iov_base; > +#else > +#if defined(HAVE_TPACKET_V2) > + struct tpacket2_hdr *ppd; > +#else > + struct tpacket_hdr *ppd; > +#endif > + unsigned int frame_num; > + unsigned int frame_nr = netdev->tp_rx_ring->rd_num; > + > + frame_num = netdev->tp_rx_ring->frame_num; > +#endif > + > + while (i < NETDEV_MAX_BURST) { > +#ifdef HAVE_TPACKET_V3 > + if ((pbd->h1.block_status & TP_STATUS_USER) == 0) { > + break; > + } > + if (fn_in_block == 0) { > + ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd + > + pbd->h1.offset_to_first_pkt); > + } > +#elif defined(HAVE_TPACKET_V2) > + ppd = (struct tpacket2_hdr *) > + netdev->tp_rx_ring->rd[frame_num].iov_base; > + if ((ppd->tp_status & TP_STATUS_USER) == 0) { > + break; > + } > +#else > + ppd = (struct tpacket_hdr > *)netdev->tp_rx_ring->rd[frame_num].iov_base; > + if ((ppd->tp_status & TP_STATUS_USER) == 0) { > + break; > + } > +#endif > + > + buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, > + DP_NETDEV_HEADROOM); > + memcpy(dp_packet_data(buffer), > + (uint8_t *) ppd + ppd->tp_mac, ppd->tp_snaplen); > + dp_packet_set_size(buffer, > + dp_packet_size(buffer) + ppd->tp_snaplen); > +#if defined(HAVE_TPACKET_V2) || defined(HAVE_TPACKET_V3) > + if (ppd->tp_status & TP_STATUS_VLAN_VALID) { > + struct eth_header *eth; > + bool double_tagged; > + ovs_be16 vlan_tpid; > + > + eth = dp_packet_data(buffer); > + double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q); > + if (ppd->tp_status & TP_STATUS_VLAN_TPID_VALID) { > +#ifdef HAVE_TPACKET_V3 > + vlan_tpid = htons(ppd->hv1.tp_vlan_tpid); > +#else > + vlan_tpid = htons(ppd->tp_vlan_tpid); > +#endif > + } else if (double_tagged) { > + vlan_tpid = htons(ETH_TYPE_VLAN_8021AD); > + } else { > + vlan_tpid = htons(ETH_TYPE_VLAN_8021Q); > + } > +#ifdef HAVE_TPACKET_V3 > + eth_push_vlan(buffer, vlan_tpid, htons(ppd->hv1.tp_vlan_tci)); > +#else > + eth_push_vlan(buffer, vlan_tpid, htons(ppd->tp_vlan_tci)); > +#endif > + 
} > +#endif > + dp_packet_batch_add(batch, buffer); > + > +#ifdef HAVE_TPACKET_V3 > + fn_in_block++; > + if (fn_in_block >= pbd->h1.num_pkts) { > + pbd->h1.block_status = TP_STATUS_KERNEL; > + block_num = (block_num + 1) % > + netdev->tp_rx_ring->req3.tp_block_nr; > + pbd = (struct block_desc *) > + netdev->tp_rx_ring->rd[block_num].iov_base; > + fn_in_block = 0; > + ppd = NULL; > + } else { > + ppd = (struct tpacket3_hdr *) > + ((uint8_t *) ppd + ppd->tp_next_offset); > + } > +#else > + ppd->tp_status = TP_STATUS_KERNEL; > + frame_num = (frame_num + 1) % frame_nr; > +#endif > + i++; > + } > + > +#ifdef HAVE_TPACKET_V3 > + netdev->tp_rx_ring->block_num = block_num; > + netdev->tp_rx_ring->frame_num_in_block = fn_in_block; > + netdev->tp_rx_ring->ppd = ppd; > +#else > + netdev->tp_rx_ring->frame_num = frame_num; > +#endif > + > + return 0; > +} > +#endif > > static int > netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, > @@ -1443,9 +1794,15 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct > dp_packet_batch *batch, > } > > dp_packet_batch_init(batch); > - retval = (rx->is_tap > - ? 
netdev_linux_batch_rxq_recv_tap(rx, mtu, batch) > - : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch)); > + if (rx->is_tap) { > + retval = netdev_linux_batch_rxq_recv_tap(rx, mtu, batch); > + } else { > +#ifndef HAVE_TPACKET > + retval = netdev_linux_batch_rxq_recv_sock(rx, mtu, batch); > +#else > + retval = netdev_linux_batch_recv_tpacket(rxq_, mtu, batch); > +#endif > + } > > if (retval) { > if (retval != EAGAIN && retval != EMSGSIZE) { > @@ -1486,6 +1843,7 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_) > } > } > > +#ifndef HAVE_TPACKET > static int > netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, > struct dp_packet_batch *batch) > @@ -1531,6 +1889,7 @@ netdev_linux_sock_batch_send(int sock, int ifindex, > bool tso, int mtu, > return error; > } > > +#else /* ifdef HAVE_TPACKET */ > /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd > is > * essential, because packets sent to a tap device with an AF_PACKET socket > * will loop back to be *received* again on the tap device. 
This doesn't > occur > @@ -1650,6 +2009,114 @@ netdev_linux_get_numa_id(const struct netdev *netdev_) > return numa_id; > } > > +static inline int > +tpacket_tx_is_ready(void * next_frame) > +{ > +#ifdef HAVE_TPACKE_V3 > + struct tpacket3_hdr *hdr = (struct tpacket3_hdr *)next_frame; > +#elif defined(HAVE_TPACKE_V2) > + struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)next_frame; > +#else > + struct tpacket_hdr *hdr = (struct tpacket_hdr *)next_frame; > +#endif > + return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)); > +} > + > +static int > +netdev_linux_tpacket_batch_send(struct netdev *netdev_, > + struct dp_packet_batch *batch) > +{ > + struct netdev_linux *netdev = netdev_linux_cast(netdev_); > + struct dp_packet *packet; > + int sockfd; > + ssize_t bytes_sent; > + int total_pkts = 0; > + > +#ifdef HAVE_TPACKET_V3 > + unsigned int frame_nr = netdev->tp_tx_ring->req3.tp_frame_nr; > +#else > + unsigned int frame_nr = netdev->tp_tx_ring->rd_num; > +#endif > + unsigned int frame_num = netdev->tp_tx_ring->frame_num; > + > + /* The Linux tap driver returns EIO if the device is not up, > + * so if the device is not up, don't waste time sending it. > + * However, if the device is in another network namespace > + * then OVS can't retrieve the state. In that case, send the > + * packets anyway. 
*/ > + if (netdev->present && !(netdev->ifi_flags & IFF_UP)) { > + netdev->tx_dropped += dp_packet_batch_size(batch); > + return 0; > + } > + > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + union frame_map ppd; > + size_t size = dp_packet_size(packet); > +#ifdef HAVE_TPACKET_V3 > + struct tpacket3_hdr *next_frame > + = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num); > +#elif defined(HAVE_TPACKET_V2) > + struct tpacket2_hdr *next_frame > + = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num); > +#else > + struct tpacket_hdr *next_frame > + = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num); > +#endif > + > + ppd.raw = next_frame; > + if (!tpacket_tx_is_ready(next_frame)) { > + break; > + } > +#ifdef HAVE_TPACKET_V3 > + next_frame->tp_snaplen = size; > + next_frame->tp_len = size; > + next_frame->tp_next_offset = 0; > + > + memcpy((uint8_t *)ppd.raw + TPACKET3_HDRLEN > + - sizeof(struct sockaddr_ll), > + dp_packet_data(packet), > + size); > +#elif defined(HAVE_TPACKET_V2) > + ppd.v2->tp_h.tp_snaplen = size; > + ppd.v2->tp_h.tp_len = size; > + > + memcpy((uint8_t *)ppd.raw + TPACKET2_HDRLEN > + - sizeof(struct sockaddr_ll), > + dp_packet_data(packet), > + size); > +#else > + ppd.v1->tp_h.tp_snaplen = size; > + ppd.v1->tp_h.tp_len = size; > + > + memcpy((uint8_t *)ppd.raw + TPACKET_HDRLEN > + - sizeof(struct sockaddr_ll), > + dp_packet_data(packet), > + size); > +#endif > + next_frame->tp_status = TP_STATUS_SEND_REQUEST; > + frame_num = (frame_num + 1) % frame_nr; > + total_pkts++; > + } > + netdev->tp_tx_ring->frame_num = frame_num; > + > + /* kick-off transmits */ > + if (total_pkts != 0) { > + sockfd = netdev->tp_tx_ring->sockfd; > + bytes_sent = sendto(sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0); > + if (bytes_sent == -1 && > + errno != ENOBUFS && errno != EAGAIN) { > + /* > + * In case of an ENOBUFS/EAGAIN error all of the enqueued > + * packets will be considered successful even though only some > + * are sent. 
> + */ > + netdev->tx_dropped += dp_packet_batch_size(batch); > + } > + } > + return 0; > +} > +#endif > + > /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive > * errno value. Returns EAGAIN without blocking if the packet cannot be > queued > * immediately. Returns EMSGSIZE if a partial packet was transmitted or if > @@ -1689,7 +2156,11 @@ netdev_linux_send(struct netdev *netdev_, int qid > OVS_UNUSED, > goto free_batch; > } > > +#ifndef HAVE_TPACKET > error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch); > +#else > + error = netdev_linux_tpacket_batch_send(netdev_, batch); > +#endif > } else { > error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch); > } > -- > 1.8.3.1 > > > _______________________________________________ > dev mailing list > [email protected] > https://mail.openvswitch.org/mailman/listinfo/ovs-dev _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
