This patch creates a new netdev type "afxdp" and copies part of the AF_XDP API implementation from xdpsock_user.c in the Linux kernel sample code. The AF_XDP eBPF programs/maps are loaded when the dpif-netdev is created, and when a user adds a netdev with type="afxdp", OVS attaches the eBPF program/map to the netdev and initializes the AF_XDP socket.
Signed-off-by: William Tu <u9012...@gmail.com> --- lib/automake.mk | 5 +- lib/dp-packet.c | 20 ++ lib/dp-packet.h | 27 +- lib/dpif-netdev-perf.h | 16 +- lib/dpif-netdev.c | 59 +++- lib/if_xdp.h | 79 +++++ lib/netdev-dummy.c | 1 + lib/netdev-linux.c | 808 ++++++++++++++++++++++++++++++++++++++++++++++++- lib/netdev-provider.h | 2 + lib/netdev-vport.c | 1 + lib/netdev.c | 11 + lib/netdev.h | 1 + lib/xdpsock.c | 70 +++++ lib/xdpsock.h | 82 +++++ 14 files changed, 1163 insertions(+), 19 deletions(-) create mode 100644 lib/if_xdp.h create mode 100644 lib/xdpsock.c create mode 100644 lib/xdpsock.h diff --git a/lib/automake.mk b/lib/automake.mk index 61fef23152d3..edb50ba812fc 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -302,7 +302,10 @@ lib_libopenvswitch_la_SOURCES = \ lib/lldp/lldpd.c \ lib/lldp/lldpd.h \ lib/lldp/lldpd-structs.c \ - lib/lldp/lldpd-structs.h + lib/lldp/lldpd-structs.h \ + lib/if_xdp.h \ + lib/xdpsock.c \ + lib/xdpsock.h if WIN32 lib_libopenvswitch_la_SOURCES += \ diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 443c22504379..b78207e33eea 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -126,6 +126,13 @@ dp_packet_uninit(struct dp_packet *b) * created as a dp_packet */ free_dpdk_buf((struct dp_packet*) b); #endif + } else if (b->source == DPBUF_AFXDP) { + struct dp_packet_afxdp *xpacket; + + xpacket = dp_packet_cast_afxdp(b); + if (xpacket->freelist_head) + umem_elem_push(xpacket->freelist_head, dp_packet_base(b)); + return; } } } @@ -254,6 +261,18 @@ dp_packet_resize__(struct dp_packet *b, size_t new_headroom, size_t new_tailroom case DPBUF_STACK: OVS_NOT_REACHED(); + case DPBUF_AFXDP: + if (new_headroom == dp_packet_headroom(b)) { + new_base = xmalloc(new_allocated); + } else { + new_base = xmalloc(new_allocated); + dp_packet_copy__(b, new_base, new_headroom, new_tailroom); + free(dp_packet_base(b)); + } + b->source = DPBUF_MALLOC; + // put back to freelist + OVS_NOT_REACHED(); + break; case DPBUF_STUB: b->source = DPBUF_MALLOC; 
new_base = xmalloc(new_allocated); @@ -439,6 +458,7 @@ dp_packet_steal_data(struct dp_packet *b) { void *p; ovs_assert(b->source != DPBUF_DPDK); + ovs_assert(b->source != DPBUF_AFXDP); if (b->source == DPBUF_MALLOC && dp_packet_data(b) == dp_packet_base(b)) { p = dp_packet_data(b); diff --git a/lib/dp-packet.h b/lib/dp-packet.h index b4b721cec241..c78b4315457e 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -30,6 +30,7 @@ #include "packets.h" #include "util.h" #include "flow.h" +#include "xdpsock.h" #ifdef __cplusplus extern "C" { @@ -42,10 +43,10 @@ enum OVS_PACKED_ENUM dp_packet_source { DPBUF_DPDK, /* buffer data is from DPDK allocated memory. * ref to dp_packet_init_dpdk() in dp-packet.c. */ + DPBUF_AFXDP, }; #define DP_PACKET_CONTEXT_SIZE 64 - /* Buffer for holding packet data. A dp_packet is automatically reallocated * as necessary if it grows too large for the available memory. * By default the packet type is set to Ethernet (PT_ETH). @@ -80,6 +81,18 @@ struct dp_packet { }; }; +struct dp_packet_afxdp { + void *next; /* point to next elem */ + struct umem_elem_head *freelist_head; + struct dp_packet packet; +}; + +static struct dp_packet_afxdp *dp_packet_cast_afxdp(const struct dp_packet *d) +{ + ovs_assert(d->source == DPBUF_AFXDP); + return CONTAINER_OF(d, struct dp_packet_afxdp, packet); +} + static inline void *dp_packet_data(const struct dp_packet *); static inline void dp_packet_set_data(struct dp_packet *, void *); static inline void *dp_packet_base(const struct dp_packet *); @@ -174,7 +187,19 @@ dp_packet_delete(struct dp_packet *b) free_dpdk_buf((struct dp_packet*) b); return; } + if (b->source == DPBUF_AFXDP) { + struct dp_packet_afxdp *xpacket; + + /* if a packet is received from afxdp port, + * and tx to a non-afxdp port. 
Then we need to + * push the rx umem back here + */ + xpacket = dp_packet_cast_afxdp(b); + if (xpacket->freelist_head) + umem_elem_push(xpacket->freelist_head, dp_packet_base(b)); + return; + } dp_packet_uninit(b); free(b); } diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h index 5993c25bc105..2d6a31dd54bd 100644 --- a/lib/dpif-netdev-perf.h +++ b/lib/dpif-netdev-perf.h @@ -108,7 +108,21 @@ cycles_counter_update(struct pmd_perf_stats *s) #ifdef DPDK_NETDEV return s->last_tsc = rte_get_tsc_cycles(); #else - return s->last_tsc = 0; + /* enable this so afxdp netdev can get stats from + * ovs-appctl dpif-netdev/pmd-stats-show + */ + union { + uint64_t tsc_64; + struct { + uint32_t lo_32; + uint32_t hi_32; + }; + } tsc; + asm volatile("rdtsc" : + "=a" (tsc.lo_32), + "=d" (tsc.hi_32)); + + return s->last_tsc = tsc.tsc_64; #endif } diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index baff020fe3d0..504d3b8f1839 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -76,6 +76,11 @@ #include "unixctl.h" #include "util.h" +#include "bpf.h" +#include "netdev.h" +#include "openvswitch/thread.h" +#include <bpf/bpf.h> + VLOG_DEFINE_THIS_MODULE(dpif_netdev); #define FLOW_DUMP_MAX_BATCH 50 @@ -507,6 +512,12 @@ struct tx_port { struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST]; }; +static struct dp_bpf { + struct bpf_state bpf; + struct netdev *outport; /* Used for downcall. */ +} bpf_datapath; + + /* A set of properties for the current processing loop that is not directly * associated with the pmd thread itself, but with the packets being * processed or the short-term system configuration (for example, time). 
@@ -1121,6 +1132,8 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], static int dpif_netdev_init(void) { + int error; + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS, clear_aux = PMD_INFO_CLEAR_STATS, poll_aux = PMD_INFO_SHOW_RXQ; @@ -1137,6 +1150,15 @@ dpif_netdev_init(void) unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]", 0, 1, dpif_netdev_pmd_rebalance, NULL); + + if (ovsthread_once_start(&once)) { + error = bpf_get(&bpf_datapath.bpf, true); + if (error) { + VLOG_ERR("%s: Load BPF datapath failed", __func__); + } + } + ovsthread_once_done(&once); + return 0; } @@ -1504,7 +1526,26 @@ dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd) ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex); ovs_mutex_unlock(&pmd->cond_mutex); } - +/* +static bool output_to_local_stack(struct netdev *netdev) +{ + return !strcmp(netdev_get_type(netdev), "tap"); +} +*/ +static bool netdev_support_xdp(const char *devname) +{ + /* + struct netdev_linux *netdev_linux = netdev_linux_cast(netdev_linux); + if (netdev_linux->ifindex == 0) + return false; +*/ + if (!strstr(devname, "afxdp")) { + return false; + } else { + return true; + } +} +static int afxdp_idx; static int port_create(const char *devname, const char *type, odp_port_t port_no, struct dp_netdev_port **portp) @@ -1519,7 +1560,6 @@ port_create(const char *devname, const char *type, /* Open and validate network device. 
*/ error = netdev_open(devname, type, &netdev); - VLOG_INFO("%s %s error %d", __func__, devname, error); if (error) { return error; } @@ -1538,6 +1578,20 @@ port_create(const char *devname, const char *type, goto out; } + if (!strcmp(type, "afxdp")) { + error = netdev_set_xdp(netdev, &bpf_datapath.bpf.afxdp[afxdp_idx]); + if (error) { + VLOG_WARN("%s XDP set failed", __func__); + goto out; + } + error = netdev_set_xskmap(netdev, bpf_datapath.bpf.xsks_map[afxdp_idx].fd); + if (error) { + VLOG_ERR("%s XSK map set error\n", __func__); + goto out; + } + afxdp_idx++; + } + port = xzalloc(sizeof *port); port->port_no = port_no; port->netdev = netdev; @@ -5254,6 +5308,7 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd, n_batches = 0; emc_processing(pmd, packets, keys, batches, &n_batches, md_is_valid, port_no); + if (!dp_packet_batch_is_empty(packets)) { /* Get ingress port from first packet's metadata. */ in_port = packets->packets[0]->md.in_port.odp_port; diff --git a/lib/if_xdp.h b/lib/if_xdp.h new file mode 100644 index 000000000000..2a8c5780166f --- /dev/null +++ b/lib/if_xdp.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. 
+ * + * Author(s): Björn Töpel <bjorn.to...@intel.com> + * Magnus Karlsson <magnus.karls...@intel.com> + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include <linux/types.h> +#include <stdbool.h> + +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ + +struct sockaddr_xdp { + __u16 sxdp_family; + __u16 sxdp_flags; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; +}; + +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ +}; + +/* XDP socket options */ +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; +}; + +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ +}; + +/* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL + +/* Rx/Tx descriptor */ +struct xdp_desc { + __u64 addr; + __u32 len; + __u32 options; +}; + +/* UMEM descriptor is __u64 */ + +#endif /* _LINUX_IF_XDP_H */ diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 44c9458a9a22..c7a065ed7ba8 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -1429,6 +1429,7 @@ netdev_dummy_update_flags(struct netdev *netdev_, NULL, /* 
set_policing */ \ NULL, /* set_filter */ \ NULL, /* set_xdp */ \ + NULL, /* set_xskmap */ \ NULL, /* get_qos_types */ \ NULL, /* get_qos_capabilities */ \ NULL, /* get_qos */ \ diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 121dd3bc738e..d2b3e35336fd 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -46,7 +46,7 @@ #include <string.h> #include <unistd.h> -#include <bpf/libbpf.h> /* linux/tools/bpf/libbpf.h */ +#include <bpf/bpf.h> /* linux/tools/bpf/libbpf.h */ #include "bpf.h" #include "coverage.h" @@ -88,6 +88,552 @@ COVERAGE_DEFINE(netdev_set_hwaddr); COVERAGE_DEFINE(netdev_get_ethtool); COVERAGE_DEFINE(netdev_set_ethtool); +#ifdef AFXDP_NETDEV +// ========================================================= +#ifndef SOL_XDP +#define SOL_XDP 283 +#endif +#ifndef AF_XDP +#define AF_XDP 44 +#endif +#ifndef PF_XDP +#define PF_XDP AF_XDP +#endif +#define DEBUG_HEXDUMP 0 + +typedef __u32 u32; +typedef uint64_t u64; + +#include "lib/xdpsock.h" + +#define AFXDP_MODE XDP_FLAGS_DRV_MODE // DRV_MODE or SKB_MODE + +static u32 opt_xdp_flags; // now alwyas set to SKB_MODE at bpf_set_link_xdp_fd +static u32 opt_xdp_bind_flags; + +struct xdp_uqueue { + u32 cached_prod; + u32 cached_cons; + u32 mask; + u32 size; + u32 *producer; + u32 *consumer; + struct xdp_desc *ring; + void *map; +}; + +struct xdpsock { + struct xdp_uqueue rx; + struct xdp_uqueue tx; + int sfd; + struct xdp_umem *umem; + u32 outstanding_tx; + unsigned long rx_npkts; + unsigned long tx_npkts; + unsigned long prev_rx_npkts; + unsigned long prev_tx_npkts; +}; + +#define MAX_SOCKS 4 + +#define barrier() __asm__ __volatile__("": : :"memory") +#define u_smp_rmb() barrier() +#define u_smp_wmb() barrier() +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +static const char pkt_data[] = + "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" + "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" + 
"\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b" + "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa"; + +static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs) +{ + u32 entries = q->cached_prod - q->cached_cons; + + if (entries == 0) { + q->cached_prod = *q->producer; + entries = q->cached_prod - q->cached_cons; + } + + return (entries > ndescs) ? ndescs : entries; +} + +static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb) +{ + u32 free_entries = q->cached_cons - q->cached_prod; + + if (free_entries >= nb) + return free_entries; + + /* Refresh the local tail pointer */ + q->cached_cons = (*q->consumer + q->size) & q->mask; + + return q->cached_cons - q->cached_prod; +} + +static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq, + struct xdp_desc *d, + size_t nb) +{ + u32 i; + + if (umem_nb_free(fq, nb) < nb) { + VLOG_ERR("%s error\n", __func__); + return -ENOSPC; + } + + for (i = 0; i < nb; i++) { + u32 idx = fq->cached_prod++ & fq->mask; + + fq->ring[idx] = d[i].addr; + } + + u_smp_wmb(); + + *fq->producer = fq->cached_prod; + + return 0; +} + +static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, uint64_t *d, + size_t nb) +{ + u32 i; + + if (umem_nb_free(fq, nb) < nb) { + VLOG_ERR("%s error\n", __func__); + return -ENOSPC; + } + + for (i = 0; i < nb; i++) { + u32 idx = fq->cached_prod++ & fq->mask; + + fq->ring[idx] = d[i]; + } + + u_smp_wmb(); + + *fq->producer = fq->cached_prod; + + return 0; +} + +static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb) +{ + u32 entries = q->cached_prod - q->cached_cons; + + if (entries == 0) { + q->cached_prod = *q->producer; + entries = q->cached_prod - q->cached_cons; + } + + return (entries > nb) ? 
nb : entries; +} + +static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq, + uint64_t *d, size_t nb) +{ + u32 idx, i, entries = umem_nb_avail(cq, nb); + + u_smp_rmb(); + + for (i = 0; i < entries; i++) { + idx = cq->cached_cons++ & cq->mask; + d[i] = cq->ring[idx]; + } + + if (entries > 0) { + u_smp_wmb(); + + *cq->consumer = cq->cached_cons; + } + + return entries; +} + +static struct xdp_umem *xdp_umem_configure(int sfd) +{ + int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS; + struct xdp_mmap_offsets off; + struct xdp_umem_reg mr; + struct xdp_umem *umem; + socklen_t optlen; + void *bufs; + int i; + + umem = calloc(1, sizeof(*umem)); + ovs_assert(umem); + +#ifdef HUGETLB +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#define PROTECTION (PROT_READ | PROT_WRITE) +/* mount -t hugetlbfs nodev /mnt + echo 100 > /proc/sys/vm/nr_hugepages +*/ + bufs = mmap(ADDR, NUM_FRAMES * FRAME_SIZE, PROTECTION, FLAGS, -1, 0); + if (bufs == MAP_FAILED) { + VLOG_FATAL("mmap hugetlb fails %s", ovs_strerror(errno)); + } +#else + ovs_assert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ + NUM_FRAMES * FRAME_SIZE) == 0); +#endif + VLOG_INFO("%s shared umem from %p to %p", __func__, + bufs, (char*)bufs + NUM_FRAMES * FRAME_SIZE); + + mr.addr = (__u64)bufs; + mr.len = NUM_FRAMES * FRAME_SIZE; + mr.chunk_size = FRAME_SIZE; + mr.headroom = FRAME_HEADROOM; + + ovs_assert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0); + ovs_assert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size, + sizeof(int)) == 0); + ovs_assert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size, + sizeof(int)) == 0); + + optlen = sizeof(off); + ovs_assert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, + &optlen) == 0); + + umem->fq.map = mmap(0, off.fr.desc + + FQ_NUM_DESCS * sizeof(u64), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_UMEM_PGOFF_FILL_RING); + ovs_assert(umem->fq.map != 
MAP_FAILED); + + umem->fq.mask = FQ_NUM_DESCS - 1; + umem->fq.size = FQ_NUM_DESCS; + umem->fq.producer = (void *)((char *)umem->fq.map + off.fr.producer); + umem->fq.consumer = (void *)((char *)umem->fq.map + off.fr.consumer); + umem->fq.ring = (void *)((char *)umem->fq.map + off.fr.desc); + umem->fq.cached_cons = FQ_NUM_DESCS; + + umem->cq.map = mmap(0, off.cr.desc + + CQ_NUM_DESCS * sizeof(u64), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_UMEM_PGOFF_COMPLETION_RING); + ovs_assert(umem->cq.map != MAP_FAILED); + + umem->cq.mask = CQ_NUM_DESCS - 1; + umem->cq.size = CQ_NUM_DESCS; + umem->cq.producer = (void *)((char *)umem->cq.map + off.cr.producer); + umem->cq.consumer = (void *)((char *)umem->cq.map + off.cr.consumer); + umem->cq.ring = (void *)((char *)umem->cq.map + off.cr.desc); + + umem->frames = bufs; + umem->fd = sfd; + umem->head.next = NULL; + umem->head.n = 0; + ovs_mutex_init(&umem->head.mutex); + + // initialize the umem->frame + for (i = NUM_FRAMES - 1; i >= 0; i--) { + struct umem_elem *elem; + + elem = (struct umem_elem *)((char *)umem->frames + i * FRAME_SIZE); + umem_elem_push(&umem->head, elem); + VLOG_INFO("umem push %p counter %d", elem, umem_elem_count(&umem->head)); + } + + // pre-setup the dp_packet + for (i = NUM_FRAMES - 1; i >= 0; i--) { + struct dp_packet_afxdp *xpacket; + struct dp_packet *packet; + char *base; + + base = (char *)umem->frames + i * FRAME_SIZE; + xpacket = (struct dp_packet_afxdp *)base; + xpacket->freelist_head = &umem->head; + + packet = &xpacket->packet; + packet->source = DPBUF_AFXDP; + dp_packet_use(packet, base + FRAME_HEADROOM, 1024); + } + VLOG_INFO("umem_elem umem %p head %p 1st %p", umem, &umem->head, umem->head.next); + + return umem; +} + +static struct xdpsock *xsk_configure(struct xdp_umem *umem, + int ifindex, int xdp_queue_id) +{ + struct sockaddr_xdp sxdp = {}; + struct xdp_mmap_offsets off; + int sfd, ndescs = NUM_DESCS; + struct xdpsock *xsk; + bool shared = false; + socklen_t 
optlen; + u64 i; + + opt_xdp_flags |= AFXDP_MODE; + opt_xdp_bind_flags |= XDP_COPY; + + sfd = socket(PF_XDP, SOCK_RAW, 0); + ovs_assert(sfd >= 0); + + xsk = calloc(1, sizeof(*xsk)); + ovs_assert(xsk); + + xsk->sfd = sfd; + xsk->outstanding_tx = 0; + + VLOG_DBG("enter: %s xsk fd %d", __func__, sfd); + if (!umem) { + shared = false; + xsk->umem = xdp_umem_configure(sfd); + } else { + xsk->umem = umem; + ovs_assert(0); + } + + ovs_assert(setsockopt(sfd, SOL_XDP, XDP_RX_RING, + &ndescs, sizeof(int)) == 0); + ovs_assert(setsockopt(sfd, SOL_XDP, XDP_TX_RING, + &ndescs, sizeof(int)) == 0); + optlen = sizeof(off); + ovs_assert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, + &optlen) == 0); + + /* Rx */ + xsk->rx.map = mmap(NULL, + off.rx.desc + + NUM_DESCS * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_PGOFF_RX_RING); + ovs_assert(xsk->rx.map != MAP_FAILED); + +/* + if (!shared) { + for (i = 0; i < NUM_DESCS * FRAME_SIZE; i += FRAME_SIZE) + ovs_assert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1) + == 0); + } +*/ + for (i = 0; i < NUM_DESCS; i++) { + struct umem_elem *elem; + uint64_t desc[1]; + + elem = umem_elem_pop(&xsk->umem->head); + desc[0] = (uint64_t)((char *)elem - xsk->umem->frames); + VLOG_INFO("pop %p and fill in fq, counter %d", elem, umem_elem_count(&xsk->umem->head)); + umem_fill_to_kernel(&xsk->umem->fq, desc, 1); + } + + // FIXME: we also configure tx here + /* Tx */ + xsk->tx.map = mmap(NULL, + off.tx.desc + + NUM_DESCS * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_PGOFF_TX_RING); + ovs_assert(xsk->tx.map != MAP_FAILED); + + xsk->rx.mask = NUM_DESCS - 1; + xsk->rx.size = NUM_DESCS; + xsk->rx.producer = (void *)((char *)xsk->rx.map + off.rx.producer); + xsk->rx.consumer = (void *)((char *)xsk->rx.map + off.rx.consumer); + xsk->rx.ring = (void *)((char *)xsk->rx.map + off.rx.desc); + + xsk->tx.mask = NUM_DESCS - 1; + xsk->tx.size = NUM_DESCS; + xsk->tx.producer = 
(void *)((char *)xsk->tx.map + off.tx.producer); + xsk->tx.consumer = (void *)((char *)xsk->tx.map + off.tx.consumer); + xsk->tx.ring = (void *)((char *)xsk->tx.map + off.tx.desc); + xsk->tx.cached_cons = NUM_DESCS; + + /* XSK socket */ + sxdp.sxdp_family = PF_XDP; + sxdp.sxdp_ifindex = ifindex; + sxdp.sxdp_queue_id = xdp_queue_id; + + if (shared) { + sxdp.sxdp_flags = XDP_SHARED_UMEM; + sxdp.sxdp_shared_umem_fd = umem->fd; + } else { + sxdp.sxdp_flags = opt_xdp_bind_flags; + } + + if (bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp))) { + VLOG_FATAL("afxdp bind failed (%s)", ovs_strerror(errno)); + } + + return xsk; +} + +static inline int xq_deq(struct xdp_uqueue *uq, + struct xdp_desc *descs, + int ndescs) +{ + struct xdp_desc *r = uq->ring; + unsigned int idx; + int i, entries; + + entries = xq_nb_avail(uq, ndescs); + + u_smp_rmb(); + + for (i = 0; i < entries; i++) { + idx = uq->cached_cons++ & uq->mask; + descs[i] = r[idx]; + } + + if (entries > 0) { + u_smp_wmb(); + + *uq->consumer = uq->cached_cons; + } + return entries; +} + +static inline void *xq_get_data(struct xdpsock *xsk, u64 addr) +{ + return &xsk->umem->frames[addr]; +} + +static void OVS_UNUSED vlog_hex_dump(const void *buf, size_t count) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + ds_put_hex_dump(&ds, buf, count, 0, false); + VLOG_INFO("\n%s", ds_cstr(&ds)); + ds_destroy(&ds); +} + +static void kick_tx(int fd) +{ + int ret; + + ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0); + if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY) { + return; + } else { + VLOG_FATAL("sendto fails %s", ovs_strerror(errno)); + } +} + +static inline void complete_tx_l2fwd(struct xdpsock *xsk) +{ + u64 descs[BATCH_SIZE]; + unsigned int rcvd; + size_t ndescs; + + if (!xsk->outstanding_tx) + return; + + kick_tx(xsk->sfd); + ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? 
BATCH_SIZE : + xsk->outstanding_tx; + + /* re-add completed Tx buffers */ + rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs); + + if (rcvd > 0) { + umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd); + xsk->outstanding_tx -= rcvd; + xsk->tx_npkts += rcvd; + } +} + +static inline void complete_tx_only(struct xdpsock *xsk) +{ + u64 descs[BATCH_SIZE]; + unsigned int rcvd; + + if (!xsk->outstanding_tx) { + VLOG_DBG("no outstanding_tx"); + return; + } + + kick_tx(xsk->sfd); + + rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE); + if (rcvd > 0) { + xsk->outstanding_tx -= rcvd; + xsk->tx_npkts += rcvd; + } +} + +static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs) +{ + u32 free_entries = q->cached_cons - q->cached_prod; + + if (free_entries >= ndescs) + return free_entries; + + /* Refresh the local tail pointer */ + q->cached_cons = *q->consumer + q->size; + return q->cached_cons - q->cached_prod; +} + +static inline int xq_enq(struct xdp_uqueue *uq, + const struct xdp_desc *descs, + unsigned int ndescs) +{ + struct xdp_desc *r = uq->ring; + unsigned int i; + + if (xq_nb_free(uq, ndescs) < ndescs) + return -ENOSPC; + + for (i = 0; i < ndescs; i++) { + u32 idx = uq->cached_prod++ & uq->mask; + + r[idx].addr = descs[i].addr; + r[idx].len = descs[i].len; + } + + u_smp_wmb(); + + *uq->producer = uq->cached_prod; + return 0; +} + +static inline int xq_enq_tx_only(struct xdp_uqueue *uq, + unsigned int id, unsigned int ndescs) +{ + struct xdp_desc *r = uq->ring; + unsigned int i; + + if (xq_nb_free(uq, ndescs) < ndescs) + return -ENOSPC; + + for (i = 0; i < ndescs; i++) { + u32 idx = uq->cached_prod++ & uq->mask; + + r[idx].addr = (id + i) << FRAME_SHIFT; + r[idx].len = sizeof(pkt_data) - 1; + } + + u_smp_wmb(); + + *uq->producer = uq->cached_prod; + return 0; +} + +static inline void print_xsk_stat(struct xdpsock *xsk OVS_UNUSED) { +#ifdef DEBUG + struct xdp_statistics stat; + socklen_t optlen; + + optlen = sizeof(stat); + 
ovs_assert(getsockopt(xsk->sfd, SOL_XDP, XDP_STATISTICS, + &stat, &optlen) == 0); + + VLOG_INFO("rx dropped %llu, rx_invalid %llu, tx_invalid %llu", + stat.rx_dropped, stat.rx_invalid_descs, stat.tx_invalid_descs); +#else + return; +#endif +} +// ========================================================= +#endif /* These were introduced in Linux 2.6.14, so they might be missing if we have * old headers. */ @@ -522,6 +1068,8 @@ struct netdev_linux { int tap_fd; bool present; /* If the device is present in the namespace */ uint64_t tx_dropped; /* tap device can drop if the iface is down */ + struct xdpsock *xsk[16]; /* af_xdp socket: each queue has one xdp sock */ + int xskmap_fd; /* map netdev's queue id to xsk fd */ }; struct netdev_rxq_linux { @@ -571,6 +1119,12 @@ is_netdev_linux_class(const struct netdev_class *netdev_class) } static bool +is_afxdp_netdev(const struct netdev *netdev) +{ + return netdev_get_class(netdev) == &netdev_afxdp_class; +} + +static bool is_tap_netdev(const struct netdev *netdev) { return netdev_get_class(netdev) == &netdev_tap_class; @@ -921,6 +1475,17 @@ netdev_linux_destruct(struct netdev *netdev_) atomic_count_dec(&miimon_cnt); } + if (is_afxdp_netdev(netdev_)) { + int ifindex; + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + + get_ifindex(netdev_, &ifindex); + bpf_set_link_xdp_fd(ifindex, -1, AFXDP_MODE); +#ifdef HUGETLB + munmap(netdev->xsk[0]->umem->frames, NUM_FRAMES * FRAME_SIZE); +#endif + } + ovs_mutex_destroy(&netdev->mutex); } @@ -950,6 +1515,43 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) rx->is_tap = is_tap_netdev(netdev_); if (rx->is_tap) { rx->fd = netdev->tap_fd; + } else if (is_afxdp_netdev(netdev_)) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int ifindex, num_socks = 0; + struct xdpsock *xsk; + int xdp_queue_id = 0; // FIXME: now use 1 queue only + int key = 0; + int xsk_fd; + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + VLOG_ERR("ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", + 
ovs_strerror(errno)); + ovs_assert(0); + } + + VLOG_INFO("%s: %s: queue=%d configuring xdp sock", + __func__, netdev_->name, xdp_queue_id); + + /* Get ethernet device index. */ + error = get_ifindex(&netdev->up, &ifindex); + if (error) { + goto error; + } + + xsk = xsk_configure(NULL, ifindex, xdp_queue_id); + + netdev->xsk[num_socks++] = xsk; + rx->fd = xsk->sfd; //for upper layer to poll + xsk_fd = xsk->sfd; + + if (xsk_fd) { + error = bpf_map_update_elem(netdev->xskmap_fd, &key, &xsk_fd, 0); + if (error) { + VLOG_ERR("failed to set xsks_map: %s", ovs_strerror(error)); + return error; + } + } + } else { struct sockaddr_ll sll; int ifindex, val; @@ -1149,6 +1751,57 @@ netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer) return 0; } +/* Receive packet from AF_XDP socket */ +static inline int +netdev_linux_rxq_xsk(struct xdpsock *xsk, + struct dp_packet_batch *batch) +{ + struct xdp_desc descs[NETDEV_MAX_BURST]; + unsigned int rcvd, i = 0; + int ret = 0; + + rcvd = xq_deq(&xsk->rx, descs, NETDEV_MAX_BURST); + if (rcvd == 0) { + return 0; + } + + for (i = 0; i < rcvd; i++) { + struct dp_packet_afxdp *xpacket; + struct dp_packet *packet; + void *base; + + base = xq_get_data(xsk, descs[i].addr); + xpacket = (struct dp_packet_afxdp *)((char *)base - FRAME_HEADROOM); + packet = &xpacket->packet; + xpacket->freelist_head = &xsk->umem->head; + + packet->source = DPBUF_AFXDP; + dp_packet_set_size(packet, descs[i].len); + dp_packet_batch_add(batch, packet); + } + + xsk->rx_npkts += rcvd; + + for (i = 0; i < rcvd; i++) { + struct umem_elem *elem; + struct xdp_desc descs[1]; + int retry_cnt = 0; +retry: + elem = umem_elem_pop(&xsk->umem->head); + if (!elem && retry_cnt < 10) { + retry_cnt++; + VLOG_WARN("retry refilling the fill queue"); + xsleep(1); + goto retry; + } + descs[0].addr = (uint64_t)((char *)elem - xsk->umem->frames); + umem_fill_to_kernel_ex(&xsk->umem->fq, descs, 1); + } + +// print_xsk_stat(xsk); + return ret; +} + static int netdev_linux_rxq_recv(struct 
netdev_rxq *rxq_, struct dp_packet_batch *batch) { @@ -1157,17 +1810,25 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch) struct dp_packet *buffer; ssize_t retval; int mtu; + struct netdev_linux *netdev_ = netdev_linux_cast(netdev); - if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) { - mtu = ETH_PAYLOAD_MAX; - } + if (!is_afxdp_netdev(netdev)) { + if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) { + mtu = ETH_PAYLOAD_MAX; + } - /* Assume Ethernet port. No need to set packet_type. */ - buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, + /* Assume Ethernet port. No need to set packet_type. */ + buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, DP_NETDEV_HEADROOM); + } + + if (is_afxdp_netdev(netdev)) { + return netdev_linux_rxq_xsk(netdev_->xsk[0], batch); + } + retval = (rx->is_tap - ? netdev_linux_rxq_recv_tap(rx->fd, buffer) - : netdev_linux_rxq_recv_sock(rx->fd, buffer)); + ? netdev_linux_rxq_recv_tap(rx->fd, buffer) : + netdev_linux_rxq_recv_sock(rx->fd, buffer)); if (retval) { if (retval != EAGAIN && retval != EMSGSIZE) { @@ -1208,6 +1869,80 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_) } static int +netdev_linux_afxdp_batch_send(struct xdpsock *xsk, /* send to xdp socket! 
*/ + struct dp_packet_batch *batch) +{ + struct dp_packet *packet; + struct xdp_uqueue *uq; + struct xdp_desc *r; + int ndescs = batch->count; + u64 descs[BATCH_SIZE]; + unsigned int rcvd = 0, total_tx = 0; + int i; + + uq = &xsk->tx; + r = uq->ring; + + if (xq_nb_free(uq, ndescs) < ndescs) { + VLOG_ERR("no free desc, outstanding tx %d, free tx nb %d", + xsk->outstanding_tx, xq_nb_free(uq, ndescs)); + return -EAGAIN; + } + + DP_PACKET_BATCH_FOR_EACH (packet, batch) { + struct umem_elem *elem; + struct dp_packet_afxdp *xpacket; + + u32 idx = uq->cached_prod++ & uq->mask; + elem = umem_elem_pop(&xsk->umem->head); + if (!elem) { + VLOG_ERR("no available elem!"); + OVS_NOT_REACHED(); + } + + memcpy(elem, dp_packet_data(packet), dp_packet_size(packet)); + //vlog_hex_dump(dp_packet_data(packet), 14); + r[idx].addr = (uint64_t)((char *)elem - xsk->umem->frames); + r[idx].len = dp_packet_size(packet); + + if (packet->source == DPBUF_AFXDP) { + xpacket = dp_packet_cast_afxdp(packet); + umem_elem_push(xpacket->freelist_head, dp_packet_base(packet)); + xpacket->freelist_head = NULL; + } + } + + u_smp_wmb(); + + *uq->producer = uq->cached_prod; + xsk->outstanding_tx += batch->count; + +retry: + kick_tx(xsk->sfd); + + rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE); + if (rcvd > 0) { + xsk->outstanding_tx -= rcvd; + xsk->tx_npkts += rcvd; + total_tx += rcvd; + } + + for (i = 0; i < rcvd; i++) { + struct umem_elem *elem; + + elem = (struct umem_elem *)(descs[i] + xsk->umem->frames); + umem_elem_push(&xsk->umem->head, elem); + } + + if (total_tx < batch->count) { + goto retry; + } + //print_xsk_stat(xsk); + + return 0; +} + +static int netdev_linux_sock_batch_send(int sock, int ifindex, struct dp_packet_batch *batch) { @@ -1312,7 +2047,8 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED, int error = 0; int sock = 0; - if (!is_tap_netdev(netdev_)) { + if (!is_tap_netdev(netdev_) && + !is_afxdp_netdev(netdev_)) { sock = af_packet_sock(); if (sock < 
0) { error = -sock; @@ -1326,7 +2062,14 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED, } error = netdev_linux_sock_batch_send(sock, ifindex, batch); + } else if (is_afxdp_netdev(netdev_)) { + struct xdpsock *xsk; + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + + xsk = netdev->xsk[0]; // FIXME: always use queue 0 + error = netdev_linux_afxdp_batch_send(xsk, batch); } else { + VLOG_INFO_RL(&rl, "%s sent to tap dev", __func__); error = netdev_linux_tap_batch_send(netdev_, batch); } if (error) { @@ -2426,12 +3169,22 @@ netdev_linux_set_xdp__(struct netdev *netdev_, const struct bpf_prog *prog, { struct netdev_linux *netdev = netdev_linux_cast(netdev_); const char *netdev_name = netdev_get_name(netdev_); - int ifindex = netdev->ifindex; + int ifindex; int error; - VLOG_DBG("Setting %s XDP filter %d on %s (ifindex %d)", prog->name, + error = get_ifindex(netdev_, &ifindex); + if (error) { + return ENODEV; + } + + + VLOG_INFO("Setting %s XDP filter %d on %s (ifindex %d)", prog->name, prog->fd, netdev_name, ifindex); + if (ifindex == 0) { + VLOG_WARN("skip device %s", netdev_name); + return 0; + } if (netdev->cache_valid & valid_bit) { error = *filter_error; if (error || (prog && prog->fd == *netdev_filter)) { @@ -2440,7 +3193,7 @@ netdev_linux_set_xdp__(struct netdev *netdev_, const struct bpf_prog *prog, } netdev->cache_valid &= ~valid_bit; } - error = bpf_set_link_xdp_fd(ifindex, prog->fd, XDP_FLAGS_SKB_MODE); + error = bpf_set_link_xdp_fd(ifindex, prog->fd, AFXDP_MODE); if (error < 0) { VLOG_WARN_RL(&rl, "%s: adding XDP filter %s failed: %s", netdev_name, prog->name, ovs_strerror(error)); @@ -2456,6 +3209,19 @@ out: } static int +netdev_linux_set_xskmap(struct netdev *netdev_, int xskmap_fd) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + + ovs_assert(xskmap_fd != 0); + + VLOG_INFO("%s xsks_map fd %d", __func__, xskmap_fd); + netdev->xskmap_fd = xskmap_fd; + + return 0; +} + +static int netdev_linux_set_xdp(struct netdev 
*netdev_, const struct bpf_prog *prog) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); @@ -3122,12 +3888,12 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, return error; } -#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \ +#define NETDEV_LINUX_CLASS(NAME, PMD, CONSTRUCT, GET_STATS, \ GET_FEATURES, GET_STATUS, \ FLOW_OFFLOAD_API) \ { \ NAME, \ - false, /* is_pmd */ \ + PMD, /* is_pmd */ \ \ NULL, \ netdev_linux_run, \ @@ -3167,6 +3933,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, netdev_linux_set_policing, \ netdev_linux_set_filter, \ netdev_linux_set_xdp, \ + netdev_linux_set_xskmap, \ netdev_linux_get_qos_types, \ netdev_linux_get_qos_capabilities, \ netdev_linux_get_qos, \ @@ -3201,9 +3968,20 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off, FLOW_OFFLOAD_API \ } +const struct netdev_class netdev_afxdp_class = + NETDEV_LINUX_CLASS( + "afxdp", + true, + netdev_linux_construct, + netdev_linux_get_stats, + netdev_linux_get_features, + netdev_linux_get_status, + LINUX_FLOW_OFFLOAD_API); + const struct netdev_class netdev_linux_class = NETDEV_LINUX_CLASS( "system", + false, netdev_linux_construct, netdev_linux_get_stats, netdev_linux_get_features, @@ -3213,6 +3991,7 @@ const struct netdev_class netdev_linux_class = const struct netdev_class netdev_tap_class = NETDEV_LINUX_CLASS( "tap", + false, netdev_linux_construct_tap, netdev_tap_get_stats, netdev_linux_get_features, @@ -3222,6 +4001,7 @@ const struct netdev_class netdev_tap_class = const struct netdev_class netdev_internal_class = NETDEV_LINUX_CLASS( "internal", + false, netdev_linux_construct, netdev_internal_get_stats, NULL, /* get_features */ diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index 3e53a5b76272..df92275d5aff 100644 --- a/lib/netdev-provider.h +++ b/lib/netdev-provider.h @@ -515,6 +515,7 @@ struct netdev_class { * * This function may be set to null if filters are not supported. 
*/ int (*set_xdp)(struct netdev *netdev, const struct bpf_prog *); + int (*set_xskmap)(struct netdev *netdev, int xsks_map_fd); /* Adds to 'types' all of the forms of QoS supported by 'netdev', or leaves * it empty if 'netdev' does not support QoS. Any names added to 'types' @@ -884,6 +885,7 @@ extern const struct netdev_class netdev_bsd_class; extern const struct netdev_class netdev_windows_class; #else extern const struct netdev_class netdev_linux_class; +extern const struct netdev_class netdev_afxdp_class; #endif extern const struct netdev_class netdev_internal_class; extern const struct netdev_class netdev_tap_class; diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 4341c89894a3..547a52c27853 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -1057,6 +1057,7 @@ netdev_vport_set_xdp(struct netdev *netdev_, const struct bpf_prog *prog) NULL, /* set_policing */ \ netdev_vport_set_filter, /* set_filter */ \ netdev_vport_set_xdp, /* set_xdp */ \ + NULL, /* set_xskmap */ \ NULL, /* get_qos_types */ \ NULL, /* get_qos_capabilities */ \ NULL, /* get_qos */ \ diff --git a/lib/netdev.c b/lib/netdev.c index c44a1a683b92..826555dd92f6 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -142,6 +142,7 @@ netdev_initialize(void) #ifdef __linux__ netdev_register_provider(&netdev_linux_class); + netdev_register_provider(&netdev_afxdp_class); netdev_register_provider(&netdev_internal_class); netdev_register_provider(&netdev_tap_class); netdev_vport_tunnel_register(); @@ -1474,6 +1475,16 @@ netdev_set_xdp(struct netdev *netdev, struct bpf_prog *prog) : EOPNOTSUPP); } +/* set xsk map */ +int +netdev_set_xskmap(struct netdev *netdev, int xskmap) +{ + return (netdev->netdev_class->set_xskmap + ? netdev->netdev_class->set_xskmap(netdev, xskmap) + : EOPNOTSUPP); +} + + /* Adds to 'types' all of the forms of QoS supported by 'netdev', or leaves it * empty if 'netdev' does not support QoS. 
Any names added to 'types' should * be documented as valid for the "type" column in the "QoS" table in diff --git a/lib/netdev.h b/lib/netdev.h index 3388504d85c9..3a8d7118378e 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -320,6 +320,7 @@ int netdev_set_policing(struct netdev *, uint32_t kbits_rate, uint32_t kbits_burst); int netdev_set_filter(struct netdev *netdev, struct bpf_prog *prog); int netdev_set_xdp(struct netdev *netdev, struct bpf_prog *prog); +int netdev_set_xskmap(struct netdev *netdev, int xsks_map_fd); int netdev_get_qos_types(const struct netdev *, struct sset *types); int netdev_get_qos_capabilities(const struct netdev *, diff --git a/lib/xdpsock.c b/lib/xdpsock.c new file mode 100644 index 000000000000..088f21e035c5 --- /dev/null +++ b/lib/xdpsock.c @@ -0,0 +1,70 @@ +#include <config.h> +#include "openvswitch/vlog.h" +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <syslog.h> +#include <time.h> +#include <unistd.h> +#include "async-append.h" +#include "coverage.h" +#include "dirs.h" +#include "openvswitch/dynamic-string.h" +#include "openvswitch/ofpbuf.h" +#include "ovs-thread.h" +#include "sat-math.h" +#include "socket-util.h" +#include "svec.h" +#include "syslog-direct.h" +#include "syslog-libc.h" +#include "syslog-provider.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" +#include "ovs-atomic.h" +#include "xdpsock.h" + +inline void +umem_elem_push(struct umem_elem_head *head, + struct umem_elem *elem) +{ + struct umem_elem *next; + + ovs_mutex_lock(&head->mutex); + next = head->next; + head->next = elem; + elem->next = next; + head->n++; + ovs_mutex_unlock(&head->mutex); +} + +inline struct umem_elem * +umem_elem_pop(struct umem_elem_head *head) +{ + struct umem_elem *next, *new_head; + + ovs_mutex_lock(&head->mutex); + next = head->next; + if (!next) { + ovs_mutex_unlock(&head->mutex); + return 
NULL; + } + new_head = next->next; + head->next = new_head; + head->n--; + ovs_mutex_unlock(&head->mutex); + return next; +} + +inline unsigned int +umem_elem_count(struct umem_elem_head *head) +{ + return head->n; +} + + diff --git a/lib/xdpsock.h b/lib/xdpsock.h new file mode 100644 index 000000000000..f916b726f863 --- /dev/null +++ b/lib/xdpsock.h @@ -0,0 +1,82 @@ +#ifndef XDPSOCK_H +#define XDPSOCK_H 1 + +#include <errno.h> +#include <getopt.h> +#include <libgen.h> +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <linux/if_xdp.h> +#include <linux/if_ether.h> +#include <net/if.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <net/ethernet.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/mman.h> +#include <time.h> +#include <unistd.h> +#include <pthread.h> +#include <locale.h> +#include <sys/types.h> +#include <poll.h> + +#include "ovs-atomic.h" +#include "openvswitch/thread.h" + +#define FRAME_HEADROOM 1024 +#define FRAME_SHIFT 11 +#define FRAME_SIZE 2048 +#define BATCH_SIZE NETDEV_MAX_BURST + +//#define DEBUG +#ifdef DEBUG +#define NUM_FRAMES 1024 +#define NUM_DESCS 64 +#define FQ_NUM_DESCS 32 +#define CQ_NUM_DESCS 32 +#else +#define NUM_FRAMES 10240 +#define NUM_DESCS 1024 +#define FQ_NUM_DESCS 1024 +#define CQ_NUM_DESCS 1024 +#endif + +struct umem_elem_head { + struct umem_elem *next; + struct ovs_mutex mutex; + uint32_t n; +}; + +struct umem_elem { + struct umem_elem *next; +}; + +struct xdp_umem_uqueue { + uint32_t cached_prod; + uint32_t cached_cons; + uint32_t mask; + uint32_t size; + uint32_t *producer; + uint32_t *consumer; + uint64_t *ring; + void *map; +}; + +struct xdp_umem { + struct umem_elem_head head; /* a list to keep free frame */ + char *frames; + struct xdp_umem_uqueue fq; + struct xdp_umem_uqueue cq; + int fd; +}; + +void umem_elem_push(struct umem_elem_head *head, + struct umem_elem *elem); +struct umem_elem *umem_elem_pop(struct 
umem_elem_head *head); +unsigned int umem_elem_count(struct umem_elem_head *head); +#endif -- 2.7.4 _______________________________________________ dev mailing list dev@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev