Hi Ilya,

Thanks for your review.

On Thu, May 30, 2019 at 8:57 AM Ilya Maximets <[email protected]> wrote:
>
> On 28.05.2019 22:01, William Tu wrote:
> > The patch introduces experimental AF_XDP support for OVS netdev.
> > diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h
> > index 859c05613ddf..a33b9a7353ba 100644
> > --- a/lib/dpif-netdev-perf.h
> > +++ b/lib/dpif-netdev-perf.h
> > @@ -21,6 +21,7 @@
> >  #include <stddef.h>
> >  #include <stdint.h>
> >  #include <string.h>
> > +#include <time.h>
> >  #include <math.h>
> >
> >  #ifdef DPDK_NETDEV
> > @@ -186,6 +187,24 @@ struct pmd_perf_stats {
> >      char *log_reason;
> >  };
> >
> > +#ifdef HAVE_AF_XDP
>
> I'd like to change this to "#ifdef __linux__".
> 'clock_gettime' is posix compliant, but CLOCK_MONOTONIC_RAW is
> Linux specific.

Yes, thanks, will do it.

>
> > +static inline uint64_t
> > +rdtsc_syscall(struct pmd_perf_stats *s)
> > +{
> > +    struct timespec val;
> > +    uint64_t v;
> > +
> > +    if (clock_gettime(CLOCK_MONOTONIC_RAW, &val) != 0) {
> > +       return s->last_tsc = 0;
>
> Maybe it's better to just return the value and allow the caller to assign?

Do you mean just:
       return s->last_tsc;

> This way you'll not need to pass any arguments here.

I don't understand; I still need to pass &val, right?
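
Or do you mean dropping the argument entirely and letting the caller do
the assignment?  A rough sketch (untested):

static inline uint64_t
rdtsc_syscall(void)
{
    struct timespec val;
    uint64_t v;

    if (clock_gettime(CLOCK_MONOTONIC_RAW, &val) != 0) {
        return 0;
    }

    v  = (uint64_t) val.tv_sec * 1000000000LL;
    v += (uint64_t) val.tv_nsec;
    return v;
}

and then in cycles_counter_update():

    return s->last_tsc = rdtsc_syscall();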

>
> > +    }
> > +
> > +    v  = (uint64_t) val.tv_sec * 1000000000LL;
> > +    v += (uint64_t) val.tv_nsec;
> > +
> > +    return s->last_tsc = v;
> > +}
> > +#endif
> > +
> >  /* Support for accurate timing of PMD execution on TSC clock cycle level.
> >   * These functions are intended to be invoked in the context of pmd threads. */
> >
> > @@ -198,6 +217,15 @@ cycles_counter_update(struct pmd_perf_stats *s)
> >  {
> >  #ifdef DPDK_NETDEV
> >      return s->last_tsc = rte_get_tsc_cycles();
> > +#elif defined(HAVE_AF_XDP) && defined(__x86_64__)
>
> And this should be:
> #elif !defined(_MSC_VER) && defined(__x86_64__)
>
> Visual Studio doesn't support inline assembly this way.
> Other things are portable as long as we're on x86_64.

right, thanks!
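
Putting the three guard changes together, cycles_counter_update() would
then look like this (sketch):

static inline uint64_t
cycles_counter_update(struct pmd_perf_stats *s)
{
#ifdef DPDK_NETDEV
    return s->last_tsc = rte_get_tsc_cycles();
#elif !defined(_MSC_VER) && defined(__x86_64__)
    /* rdtsc is an x86_64-specific instruction. */
    uint32_t h, l;

    asm volatile("rdtsc" : "=a" (l), "=d" (h));
    return s->last_tsc = ((uint64_t) h << 32) | l;
#elif defined(__linux__)
    /* Other Linux architectures fall back to clock_gettime(). */
    return rdtsc_syscall(s);
#else
    return s->last_tsc = 0;
#endif
}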

>
> > +    /* This is x86-specific instructions. */
> > +    uint32_t h, l;
> > +    asm volatile("rdtsc" : "=a" (l), "=d" (h));
> > +
> > +    return s->last_tsc = ((uint64_t) h << 32) | l;
> > +#elif defined(HAVE_AF_XDP)
>
> #elif defined(__linux__)
>
> > +    /* non-x86_64 architecture uses syscall */
> > +    return rdtsc_syscall(s);
> >  #else
> >      return s->last_tsc = 0;
> >  #endif
> > diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> > new file mode 100644

<snip>
> > +
> > +static void
> > +xsk_destroy_all(struct netdev *netdev)
> > +{
> > +    struct netdev_linux *dev = netdev_linux_cast(netdev);
> > +    int i, ifindex;
> > +
> > +    ifindex = linux_get_ifindex(netdev_get_name(netdev));
> > +
> > +    for (i = 0; i < MAX_XSKQ; i++) {
> > +        if (dev->xsk[i]) {
> > +            VLOG_INFO("destroy xsk[%d]", i);
> > +            xsk_destroy(dev->xsk[i]);
> > +            dev->xsk[i] = NULL;
> > +            dev->xsk[i]->rx_dropped = 0;
> > +            dev->xsk[i]->tx_dropped = 0;
>
> Dereferencing of a just-assigned NULL pointer. Something is definitely
> wrong here.

oh, thanks a lot.
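
The counter resets on a just-destroyed xsk make no sense anyway, so I
will simply drop those two lines (sketch):

        if (dev->xsk[i]) {
            VLOG_INFO("destroy xsk[%d]", i);
            xsk_destroy(dev->xsk[i]);
            dev->xsk[i] = NULL;
        }
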
>
> > +        }
> > +    }
> > +    VLOG_INFO("remove xdp program");
> > +    xsk_remove_xdp_program(ifindex, dev->xdpmode);
> > +}
> > +
> > +static inline void OVS_UNUSED
> > +log_xsk_stat(struct xsk_socket_info *xsk OVS_UNUSED) {
> > +    struct xdp_statistics stat;
> > +    socklen_t optlen;
> > +
> > +    optlen = sizeof stat;
> > +    ovs_assert(getsockopt(xsk_socket__fd(xsk->xsk), SOL_XDP, XDP_STATISTICS,
> > +               &stat, &optlen) == 0);
> > +
> > +    VLOG_DBG_RL(&rl, "rx dropped %llu, rx_invalid %llu, tx_invalid %llu",
> > +                stat.rx_dropped,
> > +                stat.rx_invalid_descs,
> > +                stat.tx_invalid_descs);
> > +}
> > +
> > +int
> > +netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
> > +                        char **errp OVS_UNUSED)
> > +{
> > +    struct netdev_linux *dev = netdev_linux_cast(netdev);
> > +    const char *str_xdpmode;
> > +    int xdpmode, new_n_rxq;
> > +
> > +    ovs_mutex_lock(&dev->mutex);
> > +    new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
> > +    if (new_n_rxq > MAX_XSKQ) {
> > +        ovs_mutex_unlock(&dev->mutex);
> > +        VLOG_ERR("%s: Too big 'n_rxq' (%d > %d).",
> > +                 netdev_get_name(netdev), new_n_rxq, MAX_XSKQ);
> > +        return EINVAL;
> > +    }
> > +
> > +    str_xdpmode = smap_get_def(args, "xdpmode", "skb");
> > +    if (!strcasecmp(str_xdpmode, "drv")) {
> > +        xdpmode = XDP_ZEROCOPY;
> > +    } else if (!strcasecmp(str_xdpmode, "skb")) {
> > +        xdpmode = XDP_COPY;
> > +    } else {
> > +        VLOG_ERR("%s: Incorrect xdpmode (%s).",
> > +                 netdev_get_name(netdev), str_xdpmode);
> > +        ovs_mutex_unlock(&dev->mutex);
> > +        return EINVAL;
> > +    }
> > +
> > +    if (dev->requested_n_rxq != new_n_rxq
> > +        || dev->requested_xdpmode != xdpmode) {
> > +        dev->requested_n_rxq = new_n_rxq;
> > +        dev->requested_xdpmode = xdpmode;
> > +        netdev_request_reconfigure(netdev);
> > +    }
> > +    ovs_mutex_unlock(&dev->mutex);
> > +    return 0;
> > +}
> > +
> > +int
> > +netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args)
> > +{
> > +    struct netdev_linux *dev = netdev_linux_cast(netdev);
> > +
> > +    ovs_mutex_lock(&dev->mutex);
> > +    smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
> > +    smap_add_format(args, "xdpmode", "%s",
> > +        dev->xdp_bind_flags == XDP_ZEROCOPY ? "drv" : "skb");
> > +    ovs_mutex_unlock(&dev->mutex);
> > +    return 0;
> > +}
> > +
> > +int
> > +netdev_afxdp_reconfigure(struct netdev *netdev)
> > +{
> > +    struct netdev_linux *dev = netdev_linux_cast(netdev);
> > +    struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
> > +    int err = 0;
> > +
> > +    ovs_mutex_lock(&dev->mutex);
> > +
> > +    if (netdev->n_rxq == dev->requested_n_rxq
> > +        && dev->xdpmode == dev->requested_xdpmode) {
> > +        goto out;
> > +    }
> > +
> > +    xsk_destroy_all(netdev);
> > +    netdev->n_rxq = dev->requested_n_rxq;
> > +
> > +    if (dev->requested_xdpmode == XDP_ZEROCOPY) {
> > +        VLOG_INFO("AF_XDP device %s in DRV mode", netdev_get_name(netdev));
> > +        /* From SKB mode to DRV mode */
> > +        dev->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_DRV_MODE;
> > +        dev->xdp_bind_flags = XDP_ZEROCOPY;
> > +        dev->xdpmode = XDP_ZEROCOPY;
> > +
> > +        if (setrlimit(RLIMIT_MEMLOCK, &r)) {
> > +            VLOG_ERR("ERROR: setrlimit(RLIMIT_MEMLOCK): %s",
> > +                      ovs_strerror(errno));
> > +        }
> > +    } else {
> > +        VLOG_INFO("AF_XDP device %s in SKB mode", netdev_get_name(netdev));
> > +        /* From DRV mode to SKB mode */
> > +        dev->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE;
> > +        dev->xdp_bind_flags = XDP_COPY;
> > +        dev->xdpmode = XDP_COPY;
> > +        /* TODO: set rlimit back to previous value
> > +         * when no device is in DRV mode.
> > +         */
> > +    }
> > +
> > +    err = xsk_configure_all(netdev);
> > +    if (err) {
> > +        VLOG_ERR("AF_XDP device %s reconfig fails", netdev_get_name(netdev));
> > +    }
> > +    netdev_change_seq_changed(netdev);
> > +out:
> > +    ovs_mutex_unlock(&dev->mutex);
> > +    return err;
> > +}
> > +
> > +int
> > +netdev_afxdp_get_numa_id(const struct netdev *netdev)
> > +{
> > +    /* FIXME: Get netdev's PCIe device ID, then find
> > +     * its NUMA node id.
> > +     */
> > +    VLOG_INFO("FIXME: Device %s always use numa id 0",
> > +              netdev_get_name(netdev));
> > +    return 0;
> > +}
> > +
> > +static void
> > +xsk_remove_xdp_program(uint32_t ifindex, int xdpmode)
> > +{
> > +    uint32_t curr_prog_id = 0;
> > +    uint32_t flags;
> > +
> > +    /* remove_xdp_program() */
> > +    if (xdpmode == XDP_COPY) {
> > +        flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE;
> > +    } else {
> > +        flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_DRV_MODE;
> > +    }
> > +
> > +    if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, flags)) {
> > +        bpf_set_link_xdp_fd(ifindex, -1, flags);
> > +    }
> > +    if (prog_id == curr_prog_id) {
> > +        bpf_set_link_xdp_fd(ifindex, -1, flags);
> > +    } else if (!curr_prog_id) {
> > +        VLOG_INFO("couldn't find a prog id on a given interface");
> > +    } else {
> > +        VLOG_INFO("program on interface changed, not removing");
> > +    }
> > +}
> > +
> > +void
> > +signal_remove_xdp(struct netdev *netdev)
> > +{
> > +    struct netdev_linux *dev = netdev_linux_cast(netdev);
> > +    int ifindex;
> > +
> > +    ifindex = linux_get_ifindex(netdev_get_name(netdev));
> > +
> > +    VLOG_WARN("force remove xdp program");
> > +    xsk_remove_xdp_program(ifindex, dev->xdpmode);
> > +}
> > +
> > +static struct dp_packet_afxdp *
> > +dp_packet_cast_afxdp(const struct dp_packet *d)
> > +{
> > +    ovs_assert(d->source == DPBUF_AFXDP);
> > +    return CONTAINER_OF(d, struct dp_packet_afxdp, packet);
> > +}
> > +
> > +void
> > +free_afxdp_buf(struct dp_packet *p)
> > +{
> > +    struct dp_packet_afxdp *xpacket;
> > +    unsigned long addr;
> > +
> > +    xpacket = dp_packet_cast_afxdp(p);
> > +    if (xpacket->mpool) {
> > +        void *base = dp_packet_base(p);
> > +
> > +        addr = (unsigned long)base & (~FRAME_SHIFT_MASK);
> > +        umem_elem_push(xpacket->mpool, (void *)addr);
> > +    }
> > +}
> > +
> > +static void
> > +free_afxdp_buf_batch(struct dp_packet_batch *batch)
> > +{
> > +    struct dp_packet_afxdp *xpacket = NULL;
> > +    struct dp_packet *packet;
> > +    void *elems[BATCH_SIZE];
> > +    unsigned long addr;
> > +
> > +   /* all packets are AF_XDP, so handles its own delete in batch */
>
> This comment should be somewhere else.
>
> BTW, shift right by 1 space.
>
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        xpacket = dp_packet_cast_afxdp(packet);
> > +        if (xpacket->mpool) {
> > +            void *base = dp_packet_base(packet);
> > +
> > +            addr = (unsigned long)base & (~FRAME_SHIFT_MASK);
>
> Shouldn't it be uintptr_t? Probably in some other places too.

right, I will use uintptr_t here and other places.
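
E.g. here:

    uintptr_t addr = (uintptr_t) base & ~(uintptr_t) FRAME_SHIFT_MASK;

    elems[i] = (void *) addr;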

>
> > +            elems[i] = (void *)addr;
> > +        }
> > +    }
> > +    umem_elem_push_n(xpacket->mpool, batch->count, elems);
> > +    dp_packet_batch_init(batch);
> > +}
> > +
> > +int
> > +netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
> > +                      int *qfill)
> > +{
> > +    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
> > +    struct netdev *netdev = rx->up.netdev;
> > +    struct netdev_linux *dev = netdev_linux_cast(netdev);
> > +    struct umem_elem *elems[BATCH_SIZE];
> > +    uint32_t idx_rx = 0, idx_fq = 0;
> > +    struct xsk_socket_info *xsk;
> > +    int qid = rxq_->queue_id;
> > +    unsigned int rcvd, i;
> > +    int ret = 0;
> > +
> > +    xsk = dev->xsk[qid];
> > +    rx->fd = xsk_socket__fd(xsk->xsk);
> > +
> > +    /* See if there is any packet on RX queue,
> > +     * if yes, idx_rx is the index having the packet.
> > +     */
> > +    rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
> > +    if (!rcvd) {
> > +        return 0;
> > +    }
> > +
> > +    ret = umem_elem_pop_n(&xsk->umem->mpool, rcvd, (void **)elems);
> > +    if (OVS_UNLIKELY(ret)) {
>
> We need to return rx buffers to mpool before releasing.
> Otherwise they will be lost.
>
>            for (i = 0; i < rcvd; i++) {
>                uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, i)->addr;
>
>                elems[i] = xsk_umem__get_data(xsk->umem->buffer, addr);
>            }
>            umem_elem_push_n(&xsk->umem->mpool, rcvd, (void **)elems);
>
> Please re-check the above code snippet before using it.

good point, thanks
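
Since the same recovery is needed for the FILL-queue-full case below, I
will probably pull your snippet into a small helper, roughly like this
(untested sketch; the helper name is mine, and the descriptor indexing
still needs a re-check):

static void
afxdp_return_rx_buffers(struct xsk_socket_info *xsk, uint32_t idx_rx,
                        uint32_t rcvd)
{
    void *elems[BATCH_SIZE];
    uint32_t i;

    for (i = 0; i < rcvd; i++) {
        /* 'idx_rx' is the first descriptor index returned by peek(). */
        uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx + i)->addr;

        elems[i] = xsk_umem__get_data(xsk->umem->buffer, addr);
    }
    umem_elem_push_n(&xsk->umem->mpool, rcvd, elems);
}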

>
> > +        xsk_ring_cons__release(&xsk->rx, rcvd);
> > +        xsk->rx_dropped += rcvd;
> > +        return ENOMEM;
> > +    }
> > +
> > +    /* Prepare for the FILL queue */
> > +    if (!xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq)) {
> > +        /* The FILL queue is full, don't retry or process rx. Wait for kernel
> > +         * to move received packets from FILL queue to RX queue.
> > +         */
> > +        umem_elem_push_n(&xsk->umem->mpool, rcvd, (void **)elems);
>
> Same here.
>
> > +        xsk_ring_cons__release(&xsk->rx, rcvd);
> > +        xsk->rx_dropped += rcvd;
> > +        return ENOMEM;
> > +    }
> > +
> > +    /* Setup a dp_packet batch from descriptors in RX queue */
> > +    for (i = 0; i < rcvd; i++) {
> > +        uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
> > +        uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->len;
> > +        char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
> > +        uint64_t index;
> > +
> > +        struct dp_packet_afxdp *xpacket;
> > +        struct dp_packet *packet;
> > +
> > +        index = addr >> FRAME_SHIFT;
> > +        xpacket = UMEM2XPKT(xsk->umem->xpool.array, index);
> > +        packet = &xpacket->packet;
> > +
> > +        /* Initialize the struct dp_packet */
> > +        dp_packet_use_afxdp(packet, pkt, FRAME_SIZE - FRAME_HEADROOM);
> > +        dp_packet_set_size(packet, len);
> > +
> > +        /* Add packet into batch, increase batch->count */
> > +        dp_packet_batch_add(batch, packet);
> > +
> > +        idx_rx++;
> > +    }
> > +    /* Release the RX queue */
> > +    xsk_ring_cons__release(&xsk->rx, rcvd);
> > +
> > +    for (i = 0; i < rcvd; i++) {
> > +        uint64_t index;
> > +        struct umem_elem *elem;
> > +
> > +        /* Get one free umem, program it into FILL queue */
> > +        elem = elems[i];
> > +        index = (uint64_t)((char *)elem - (char *)xsk->umem->buffer);
> > +        ovs_assert((index & FRAME_SHIFT_MASK) == 0);
> > +        *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq) = index;
> > +
> > +        idx_fq++;
> > +    }
> > +    xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
> > +
> > +    if (qfill) {
> > +        /* TODO: return the number of remaining packets in the queue. */
> > +        *qfill = 0;
> > +    }
> > +
> > +#ifdef AFXDP_DEBUG
> > +    log_xsk_stat(xsk);
> > +#endif
> > +    return 0;
> > +}
> > +
> > +static inline int
> > +kick_tx(struct xsk_socket_info *xsk)
> > +{
> > +    int ret;
> > +
> > +    /* This causes system call into kernel's xsk_sendmsg, and
> > +     * xsk_generic_xmit (skb mode) or xsk_async_xmit (driver mode).
> > +     */
> > +    ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
> > +    if (OVS_UNLIKELY(ret < 0)) {
> > +        if (errno == ENXIO || errno == ENOBUFS || errno == EOPNOTSUPP) {
> > +            return errno;
> > +        }
> > +    }
> > +    /* no error, or EBUSY or EAGAIN */
> > +    return 0;
> > +}
> > +
> > +static inline bool
> > +check_free_batch(struct dp_packet_batch *batch)
> > +{
> > +    struct umem_pool *first_mpool = NULL;
> > +    struct dp_packet_afxdp *xpacket;
> > +    struct dp_packet *packet;
> > +
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        if (packet->source != DPBUF_AFXDP) {
> > +            return false;
> > +        }
> > +        xpacket = dp_packet_cast_afxdp(packet);
> > +        if (i == 0) {
> > +            first_mpool = xpacket->mpool;
> > +            continue;
> > +        }
> > +        if (xpacket->mpool != first_mpool) {
> > +            return false;
> > +        }
> > +    }
> > +    /* All packets are DPBUF_AFXDP and from the same mpool */
> > +    return true;
> > +}
> > +
> > +static inline void
> > +afxdp_complete_tx(struct xsk_socket_info *xsk)
> > +{
> > +    struct umem_elem *elems_push[BATCH_SIZE];
> > +    uint32_t idx_cq = 0;
> > +    int tx_done, j, ret;
> > +
> > +    if (!xsk->outstanding_tx) {
> > +        return;
> > +    }
> > +
> > +    ret = kick_tx(xsk);
> > +    if (OVS_UNLIKELY(ret)) {
> > +        VLOG_WARN_RL(&rl, "error sending AF_XDP packet: %s",
> > +                     ovs_strerror(ret));
> > +    }
> > +
> > +    tx_done = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx_cq);
> > +    if (tx_done > 0) {
> > +        xsk_ring_cons__release(&xsk->umem->cq, tx_done);
> > +        xsk->outstanding_tx -= tx_done;
> > +    }
> > +
> > +    /* Recycle back to umem pool */
> > +    for (j = 0; j < tx_done; j++) {
> > +        struct umem_elem *elem;
> > +        uint64_t addr;
> > +
> > +        addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx_cq++);
> > +        elem = ALIGNED_CAST(struct umem_elem *,
> > +                            (char *)xsk->umem->buffer + addr);
> > +        elems_push[j] = elem;
> > +    }
> > +
> > +    umem_elem_push_n(&xsk->umem->mpool, tx_done, (void **)elems_push);
> > +}
> > +
> > +int
> > +netdev_afxdp_batch_send(struct netdev *netdev_, int qid,
> > +                        struct dp_packet_batch *batch,
> > +                        bool concurrent_txq)
> > +{
> > +    struct netdev_linux *dev = netdev_linux_cast(netdev_);
> > +    struct xsk_socket_info *xsk = dev->xsk[qid];
> > +    struct umem_elem *elems_pop[BATCH_SIZE];
> > +    struct dp_packet *packet;
> > +    bool free_batch = true;
> > +    uint32_t idx = 0;
> > +    int error = 0;
> > +    int ret;
> > +
> > +    if (OVS_UNLIKELY(concurrent_txq)) {
> > +        ovs_spin_lock(&dev->tx_lock);
>
> Using the same lock for all queues will produce a lot of unnecessary
> contention. It's better to allocate an array of locks, one per tx queue.
> You may re-allocate it in the reconfigure() implementation.

Right, will do per tx lock in next version.
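
Something along these lines (field and names tentative):

    /* In struct netdev_linux: */
    ovs_spinlock_t *tx_locks;       /* One spinlock per tx queue. */

    /* In netdev_afxdp_reconfigure(), once the queue count is known: */
    free(dev->tx_locks);
    dev->tx_locks = xcalloc(netdev->n_txq, sizeof *dev->tx_locks);
    for (i = 0; i < netdev->n_txq; i++) {
        ovs_spinlock_init(&dev->tx_locks[i]);
    }

    /* In netdev_afxdp_batch_send(): */
    if (OVS_UNLIKELY(concurrent_txq)) {
        qid = qid % netdev_->n_txq;
        ovs_spin_lock(&dev->tx_locks[qid]);
    }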

>
> > +    }
> > +
> > +    /* Process CQ first. */
> > +    afxdp_complete_tx(xsk);
> > +
> > +    free_batch = check_free_batch(batch);
> > +
> > +    ret = umem_elem_pop_n(&xsk->umem->mpool, batch->count, (void **)elems_pop);
> > +    if (OVS_UNLIKELY(ret)) {
> > +        xsk->tx_dropped += batch->count;
> > +        error = ENOMEM;
> > +        goto out;
> > +    }
> > +
> > +    /* Make sure we have enough TX descs */
> > +    ret = xsk_ring_prod__reserve(&xsk->tx, batch->count, &idx);
> > +    if (OVS_UNLIKELY(ret == 0)) {
> > +        umem_elem_push_n(&xsk->umem->mpool, batch->count, (void **)elems_pop);
> > +        xsk->tx_dropped += batch->count;
> > +        error = ENOMEM;
> > +        goto out;
> > +    }
> > +
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        struct umem_elem *elem;
> > +        uint64_t index;
> > +
> > +        elem = elems_pop[i];
> > +        /* Copy the packet to the umem we just pop from umem pool.
> > +         * TODO: avoid this copy if the packet and the pop umem
> > +         * are located in the same umem.
> > +         */
> > +        memcpy(elem, dp_packet_data(packet), dp_packet_size(packet));
> > +
> > +        index = (uint64_t)((char *)elem - (char *)xsk->umem->buffer);
> > +        xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr = index;
> > +        xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len
> > +            = dp_packet_size(packet);
> > +    }
> > +    xsk_ring_prod__submit(&xsk->tx, batch->count);
> > +    xsk->outstanding_tx += batch->count;
> > +
> > +    ret = kick_tx(xsk);
> > +    if (OVS_UNLIKELY(ret)) {
> > +        umem_elem_push_n(&xsk->umem->mpool, batch->count, (void **)elems_pop);
>
> Are we really able to re-use these buffers? They are already in the tx
> ring and will probably be sent on the next kick_tx().
>
Right, so probably I should just print the warning message and continue.
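
I.e. leave the buffers alone, since their descriptors are already
submitted to the TX ring and may still go out on a later kick:

    ret = kick_tx(xsk);
    if (OVS_UNLIKELY(ret)) {
        /* The descriptors are already in the TX ring, so don't recycle
         * the umem elements here; just report the error. */
        VLOG_WARN_RL(&rl, "error sending AF_XDP packet: %s",
                     ovs_strerror(ret));
    }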

> > +        VLOG_WARN_RL(&rl, "error sending AF_XDP packet: %s",
> > +                     ovs_strerror(ret));
> > +    }
> > +
> > +out:
> > +    if (free_batch) {
> > +        free_afxdp_buf_batch(batch);
> > +    } else {
> > +        dp_packet_delete_batch(batch, true);
> > +    }
> > +
> > +    if (OVS_UNLIKELY(concurrent_txq)) {
> > +        ovs_spin_unlock(&dev->tx_lock);
> > +    }
> > +    return error;
> > +}
> > +
> > +int
> > +netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_ OVS_UNUSED)
> > +{
> > +   /* Done at reconfigure */
> > +   return 0;
> > +}
> > +
> > +void
> > +netdev_afxdp_destruct(struct netdev *netdev_)
> > +{
> > +    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
> > +
> > +    /* Note: tc is by-passed when using drv-mode, but when using
> > +     * skb-mode, we might need to clean up tc. */
> > +
> > +    xsk_destroy_all(netdev_);
> > +    ovs_mutex_destroy(&netdev->mutex);
> > +}
> > +
> > +int
> > +netdev_afxdp_get_stats(const struct netdev *netdev_,
>
> You don't need an underscore here.
>
> > +                       struct netdev_stats *stats)
> > +{
> > +    struct netdev_linux *dev = netdev_linux_cast(netdev_);
> > +    struct netdev_stats dev_stats;
> > +    struct xsk_socket_info *xsk;
> > +    int error, i;
> > +
> > +    ovs_mutex_lock(&dev->mutex);
> > +
> > +    error = get_stats_via_netlink(netdev_, &dev_stats);
> > +    if (error) {
> > +        VLOG_WARN_RL(&rl, "Error getting AF_XDP statistics");
> > +    } else {
> > +        /* Use kernel netdev's packet and byte counts */
> > +        stats->rx_packets = dev_stats.rx_packets;
> > +        stats->rx_bytes = dev_stats.rx_bytes;
> > +        stats->tx_packets = dev_stats.tx_packets;
> > +        stats->tx_bytes = dev_stats.tx_bytes;
> > +
> > +        stats->rx_errors           += dev_stats.rx_errors;
> > +        stats->tx_errors           += dev_stats.tx_errors;
> > +        stats->rx_dropped          += dev_stats.rx_dropped;
> > +        stats->tx_dropped          += dev_stats.tx_dropped;
> > +        stats->multicast           += dev_stats.multicast;
> > +        stats->collisions          += dev_stats.collisions;
> > +        stats->rx_length_errors    += dev_stats.rx_length_errors;
> > +        stats->rx_over_errors      += dev_stats.rx_over_errors;
> > +        stats->rx_crc_errors       += dev_stats.rx_crc_errors;
> > +        stats->rx_frame_errors     += dev_stats.rx_frame_errors;
> > +        stats->rx_fifo_errors      += dev_stats.rx_fifo_errors;
> > +        stats->rx_missed_errors    += dev_stats.rx_missed_errors;
> > +        stats->tx_aborted_errors   += dev_stats.tx_aborted_errors;
> > +        stats->tx_carrier_errors   += dev_stats.tx_carrier_errors;
> > +        stats->tx_fifo_errors      += dev_stats.tx_fifo_errors;
> > +        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
> > +        stats->tx_window_errors    += dev_stats.tx_window_errors;
> > +
> > +        /* Account the dropped in each xsk */
> > +        for (i = 0; i < MAX_XSKQ; i++) {
>
> i < netdev_n_rxq(netdev)
>
> > +            xsk = dev->xsk[i];
> > +            if (xsk) {
> > +                stats->rx_dropped += xsk->rx_dropped;
> > +                stats->tx_dropped += xsk->tx_dropped;
> > +            }
> > +        }
> > +    }
> > +    ovs_mutex_unlock(&dev->mutex);
> > +
> > +    return error;
> > +}
> > diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h
> > new file mode 100644
> > index 000000000000..dd2dc1a2064d
> > --- /dev/null
> > +++ b/lib/netdev-afxdp.h
> > @@ -0,0 +1,74 @@
> > +/*
> > + * Copyright (c) 2018, 2019 Nicira, Inc.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#ifndef NETDEV_AFXDP_H
> > +#define NETDEV_AFXDP_H 1
> > +
> > +#include <config.h>
> > +
> > +#ifdef HAVE_AF_XDP
> > +
> > +#include <stdint.h>
> > +#include <stdbool.h>
> > +
> > +/* These functions are Linux AF_XDP specific, so they should be used directly
> > + * only by Linux-specific code. */
> > +
> > +#define MAX_XSKQ 16
> > +
> > +struct netdev;
> > +struct xsk_socket_info;
> > +struct xdp_umem;
> > +struct dp_packet_batch;
> > +struct smap;
> > +struct dp_packet;
> > +struct netdev_rxq;
> > +struct netdev_stats;
> > +
> > +int netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_);
> > +void netdev_afxdp_destruct(struct netdev *netdev_);
> > +
> > +int netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_,
> > +                          struct dp_packet_batch *batch,
> > +                          int *qfill);
> > +int netdev_afxdp_batch_send(struct netdev *netdev_, int qid,
> > +                            struct dp_packet_batch *batch,
> > +                            bool concurrent_txq);
> > +int netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
> > +                            char **errp);
> > +int netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args);
> > +int netdev_afxdp_get_numa_id(const struct netdev *netdev);
> > +int netdev_afxdp_get_stats(const struct netdev *netdev_,
> > +                           struct netdev_stats *stats);
> > +
> > +void free_afxdp_buf(struct dp_packet *p);
> > +int netdev_afxdp_reconfigure(struct netdev *netdev);
> > +void signal_remove_xdp(struct netdev *netdev);
> > +
> > +#else /* !HAVE_AF_XDP */
> > +
> > +#include "openvswitch/compiler.h"
> > +
> > +struct dp_packet;
> > +
> > +static inline void
> > +free_afxdp_buf(struct dp_packet *p OVS_UNUSED)
> > +{
> > +    /* Nothing */
> > +}
> > +
> > +#endif /* HAVE_AF_XDP */
> > +#endif /* netdev-afxdp.h */
> > diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
> > new file mode 100644
> > index 000000000000..d43f79e6aa41
> > --- /dev/null
> > +++ b/lib/netdev-linux-private.h
> > @@ -0,0 +1,139 @@
> > +/*
> > + * Copyright (c) 2019 Nicira, Inc.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#ifndef NETDEV_LINUX_PRIVATE_H
> > +#define NETDEV_LINUX_PRIVATE_H 1
> > +
> > +#include <config.h>
> > +
> > +#include <linux/filter.h>
> > +#include <linux/gen_stats.h>
> > +#include <linux/if_ether.h>
> > +#include <linux/if_tun.h>
> > +#include <linux/types.h>
> > +#include <linux/ethtool.h>
> > +#include <linux/mii.h>
> > +#include <stdint.h>
> > +#include <stdbool.h>
> > +
> > +#include "netdev-afxdp.h"
> > +#include "netdev-provider.h"
> > +#include "netdev-tc-offloads.h"
> > +#include "netdev-vport.h"
> > +#include "openvswitch/thread.h"
> > +#include "ovs-atomic.h"
> > +#include "timer.h"
> > +#include "xdpsock.h"
> > +
> > +/* These functions are Linux specific, so they should be used directly only by
> > + * Linux-specific code. */
> > +
> > +struct netdev;
> > +
> > +struct netdev_rxq_linux {
> > +    struct netdev_rxq up;
> > +    bool is_tap;
> > +    int fd;
> > +};
> > +
> > +void netdev_linux_run(const struct netdev_class *);
> > +
> > +int netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
> > +                                  const char *flag_name, bool enable);
> > +
> > +int get_stats_via_netlink(const struct netdev *netdev_,
> > +                          struct netdev_stats *stats);
> > +
> > +struct netdev_linux {
> > +    struct netdev up;
> > +
> > +    /* Protects all members below. */
> > +    struct ovs_mutex mutex;
> > +
> > +    unsigned int cache_valid;
> > +
> > +    bool miimon;                    /* Link status of last poll. */
> > +    long long int miimon_interval;  /* Miimon Poll rate. Disabled if <= 0. */
> > +    struct timer miimon_timer;
> > +
> > +    int netnsid;                    /* Network namespace ID. */
> > +    /* The following are figured out "on demand" only.  They are only valid
> > +     * when the corresponding VALID_* bit in 'cache_valid' is set. */
> > +    int ifindex;
> > +    struct eth_addr etheraddr;
> > +    int mtu;
> > +    unsigned int ifi_flags;
> > +    long long int carrier_resets;
> > +    uint32_t kbits_rate;        /* Policing data. */
> > +    uint32_t kbits_burst;
> > +    int vport_stats_error;      /* Cached error code from vport_get_stats().
> > +                                   0 or an errno value. */
> > +    int netdev_mtu_error;       /* Cached error code from SIOCGIFMTU
> > +                                 * or SIOCSIFMTU.
> > +                                 */
> > +    int ether_addr_error;       /* Cached error code from set/get etheraddr. */
> > +    int netdev_policing_error;  /* Cached error code from set policing. */
> > +    int get_features_error;     /* Cached error code from ETHTOOL_GSET. */
> > +    int get_ifindex_error;      /* Cached error code from SIOCGIFINDEX. */
> > +
> > +    enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
> > +    enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
> > +    enum netdev_features supported;  /* Cached from ETHTOOL_GSET. */
> > +
> > +    struct ethtool_drvinfo drvinfo;  /* Cached from ETHTOOL_GDRVINFO. */
> > +    struct tc *tc;
> > +
> > +    /* For devices of class netdev_tap_class only. */
> > +    int tap_fd;
> > +    bool present;               /* If the device is present in the namespace */
> > +    uint64_t tx_dropped;        /* tap device can drop if the iface is down */
> > +
> > +    /* LAG information. */
> > +    bool is_lag_master;         /* True if the netdev is a LAG master. */
> > +
> > +    /* AF_XDP information */
> > +#ifdef HAVE_AF_XDP
> > +    struct xsk_socket_info *xsk[MAX_XSKQ];
>
> You may allocate this array dynamically based on the n_rxq while performing
> reconfiguration. This way you will also have no limit on the number of rxqs.

Makes sense, thanks.
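
Something like this (sketch):

    /* In struct netdev_linux, replacing the fixed-size array: */
    struct xsk_socket_info **xsks;   /* One entry per rx queue. */

    /* In netdev_afxdp_reconfigure(): */
    dev->xsks = xcalloc(netdev->n_rxq, sizeof *dev->xsks);

with the old array torn down in xsk_destroy_all() before reallocating.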

<snip>

> > --- /dev/null
> > +++ b/lib/spinlock.h
> > @@ -0,0 +1,70 @@
> > +/*
> > + * Copyright (c) 2018, 2019 Nicira, Inc.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +#ifndef SPINLOCK_H
> > +#define SPINLOCK_H 1
> > +
> > +#include <config.h>
> > +
> > +#include <ctype.h>
> > +#include <errno.h>
> > +#include <fcntl.h>
> > +#include <stdarg.h>
> > +#include <stdlib.h>
> > +#include <unistd.h>
> > +
> > +#include "ovs-atomic.h"
> > +
> > +typedef struct {
>
> It's probably better to not use 'typedef'. OVS doesn't use
> typedefs for structures, unions and enums usually.
> For example we have no typedef for 'struct ovs_mutex'.
> So, this should be just 'struct ovs_spinlock'.
>
>
> We may also add some annotations like OVS_LOCKABLE and clang
> thread safety annotations: OVS_ACQUIRES, OVS_TRY_LOCK, OVS_RELEASES.
> However, this could be done later.
>

OK, will do it.
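
Roughly, following the ovs_mutex conventions (sketch):

struct OVS_LOCKABLE ovs_spinlock {
    atomic_int locked;
};

static inline void
ovs_spinlock_init(struct ovs_spinlock *sl)
{
    atomic_init(&sl->locked, 0);
}

static inline void
ovs_spin_lock(struct ovs_spinlock *sl)
    OVS_ACQUIRES(sl)
{
    int exp = 0, locked = 0;

    while (!atomic_compare_exchange_strong_explicit(&sl->locked, &exp, 1,
                memory_order_acquire,
                memory_order_relaxed)) {
        /* Spin on a relaxed read until the lock looks free, then retry
         * the CAS. */
        locked = 1;
        while (locked) {
            atomic_read_relaxed(&sl->locked, &locked);
        }
        exp = 0;
    }
}

static inline void
ovs_spin_unlock(struct ovs_spinlock *sl)
    OVS_RELEASES(sl)
{
    atomic_store_explicit(&sl->locked, 0, memory_order_release);
}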

> > +    atomic_int locked;
> > +} ovs_spinlock_t;
> > +
> > +static inline void
> > +ovs_spinlock_init(ovs_spinlock_t *sl)
> > +{
> > +    atomic_init(&sl->locked, 0);
> > +}
> > +
> > +static inline void
> > +ovs_spin_lock(ovs_spinlock_t *sl)
> > +{
> > +    int exp = 0, locked = 0;
> > +
> > +    while (!atomic_compare_exchange_strong_explicit(&sl->locked, &exp, 1,
> > +                memory_order_acquire,
> > +                memory_order_relaxed)) {
> > +        locked = 1;
> > +        while (locked) {
> > +            atomic_read_relaxed(&sl->locked, &locked);
> > +        }
> > +        exp = 0;
> > +    }
> > +}
> > +
> > +static inline void
> > +ovs_spin_unlock(ovs_spinlock_t *sl)
> > +{
> > +    atomic_store_explicit(&sl->locked, 0, memory_order_release);
> > +}
> > +
> > +static inline int OVS_UNUSED
>
> Not sure that we need the UNUSED annotation since we're in a header now.
>
> > +ovs_spin_trylock(ovs_spinlock_t *sl)
> > +{
> > +    int exp = 0;
> > +    return atomic_compare_exchange_strong_explicit(&sl->locked, &exp, 1,
> > +                memory_order_acquire,
> > +                memory_order_relaxed);
> > +}
> > +#endif
> > diff --git a/lib/util.c b/lib/util.c
> > index 5679232ffc5f..060b1e287bce 100644
> > --- a/lib/util.c
> > +++ b/lib/util.c
> > @@ -277,6 +277,49 @@ free_cacheline(void *p)
> >  #endif
> >  }
> >
> > +#ifdef HAVE_AF_XDP
>
> I don't think that we need 'ifdef' here.
>
> How about renaming 'xmalloc_cacheline' to 'xmalloc_size_align', making
> it allocate memory aligned to a specified size and in dedicated cache
> lines?
>
> And implement two functions:
> xmalloc_cacheline(size)
> {
>     return xmalloc_size_align(size, CACHE_LINE_SIZE);
> }
> xmalloc_pagealign(size)
> {
>     return xmalloc_size_align(size, get_page_size());
> }
>

I think it's better, will do it.
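
I.e. keep one generic allocator and two thin wrappers (sketch;
xmalloc_size_align() would be the current xmalloc_cacheline() body with
CACHE_LINE_SIZE replaced by the 'alignment' argument):

void *
xmalloc_cacheline(size_t size)
{
    return xmalloc_size_align(size, CACHE_LINE_SIZE);
}

void *
xmalloc_pagealign(size_t size)
{
    return xmalloc_size_align(size, get_page_size());
}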

> > +void *
> > +xmalloc_pagealign(size_t size)
> > +{
> > +#ifdef HAVE_POSIX_MEMALIGN
> > +    void *p;
> > +    int error;
> > +
> > +    COVERAGE_INC(util_xalloc);
> > +    error = posix_memalign(&p, get_page_size(), size ? size : 1);
> > +    if (error != 0) {
> > +        out_of_memory();
> > +    }
> > +    return p;
> > +#else
> > +    /* Similar to xmalloc_cacheline, but replace
> > +     * CACHE_LINE_SIZE with get_page_size() */
> > +    void *p = xmalloc((get_page_size() - 1)
> > +                      + sizeof(void *)
> > +                      + ROUND_UP(size, get_page_size()));
>
> I think that you don't need to round up to a page size.
> You need to round up to a CACHE_LINE_SIZE, probably.
> There is no point in allocating that much extra memory.
> Below code should be re-checked too.
>

Right, with the round-up reduced to CACHE_LINE_SIZE we waste at most
(get_page_size() - 1) bytes for the alignment in the worst case.
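
The non-posix_memalign fallback of xmalloc_size_align() would then
become something like this (sketch, to be re-checked):

    void *p = xmalloc((alignment - 1)
                      + sizeof(void *)
                      + ROUND_UP(size, CACHE_LINE_SIZE));
    bool runt = PAD_SIZE((uintptr_t) p, alignment) < sizeof(void *);
    void *r = (void *) ROUND_UP((uintptr_t) p + (runt ? alignment : 0),
                                alignment);
    void **q = (void **) r - 1;
    *q = p;
    return r;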

> > +    bool runt = PAD_SIZE((uintptr_t) p, get_page_size()) < sizeof(void *);
> > +    void *r = (void *) ROUND_UP((uintptr_t) p + (runt ? get_page_size() : 0),
> > +                                get_page_size());
> > +    void **q = (void **) r - 1;
> > +    *q = p;
> > +    return r;
> > +#endif
> > +}
> > +
> > +void
> > +free_pagealign(void *p)
> > +{
> > +#ifdef HAVE_POSIX_MEMALIGN
> > +    free(p);
> > +#else
> > +    if (p) {
> > +        void **q = (void **) p - 1;
> > +        free(*q);
> > +    }
> > +#endif
> > +}
> > +#endif
> > +
> >  char *
> >  xasprintf(const char *format, ...)
> >  {
> > diff --git a/lib/util.h b/lib/util.h
> > index 53354f1c6f0f..3cd8cf87fba8 100644
> > --- a/lib/util.h
> > +++ b/lib/util.h
> > @@ -163,6 +163,11 @@ void ovs_strzcpy(char *dst, const char *src, size_t size);
> >
> >  int string_ends_with(const char *str, const char *suffix);
> >
> > +#ifdef HAVE_AF_XDP
> > +void *xmalloc_pagealign(size_t) MALLOC_LIKE;
> > +void free_pagealign(void *);
> > +#endif
> > +
> >  /* The C standards say that neither the 'dst' nor 'src' argument to
> >   * memcpy() may be null, even if 'n' is zero.  This wrapper tolerates
> >   * the null case. */
> > diff --git a/lib/xdpsock.c b/lib/xdpsock.c
> > new file mode 100644
> > index 000000000000..ffdb54dfcd27
> > --- /dev/null
> > +++ b/lib/xdpsock.c
> > @@ -0,0 +1,179 @@
> > +/*
> > + * Copyright (c) 2018, 2019 Nicira, Inc.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +#include <config.h>
> > +
> > +#include "xdpsock.h"
> > +#include "dp-packet.h"
> > +#include "openvswitch/compiler.h"
> > +
> > +/* Note:
> > + * umem_elem_push* shouldn't overflow because we always pop
> > + * elem first, then push back to the stack.
> > + */
> > +static inline void
> > +__umem_elem_push_n(struct umem_pool *umemp, int n, void **addrs)
> > +{
> > +    void *ptr;
> > +
> > +    if (OVS_UNLIKELY(umemp->index + n > umemp->size)) {
> > +        OVS_NOT_REACHED();
> > +    }
> > +
> > +    ptr = &umemp->array[umemp->index];
> > +    memcpy(ptr, addrs, n * sizeof(void *));
> > +    umemp->index += n;
> > +}
> > +
> > +void umem_elem_push_n(struct umem_pool *umemp, int n, void **addrs)
> > +{
> > +    ovs_spin_lock(&umemp->mutex);
> > +    __umem_elem_push_n(umemp, n, addrs);
> > +    ovs_spin_unlock(&umemp->mutex);
> > +}
> > +
> > +static inline void
> > +__umem_elem_push(struct umem_pool *umemp, void *addr)
> > +{
> > +    if (OVS_UNLIKELY(umemp->index + 1) > umemp->size) {
> > +        OVS_NOT_REACHED();
> > +    }
> > +
> > +    umemp->array[umemp->index++] = addr;
> > +}
> > +
> > +void
> > +umem_elem_push(struct umem_pool *umemp, void *addr)
> > +{
> > +
> > +    ovs_assert(((uint64_t)addr & FRAME_SHIFT_MASK) == 0);
> > +
> > +    ovs_spin_lock(&umemp->mutex);
> > +    __umem_elem_push(umemp, addr);
> > +    ovs_spin_unlock(&umemp->mutex);
> > +}
> > +
> > +static inline int
> > +__umem_elem_pop_n(struct umem_pool *umemp, int n, void **addrs)
> > +{
> > +    void *ptr;
> > +
> > +    if (OVS_UNLIKELY(umemp->index - n < 0)) {
> > +        return -ENOMEM;
> > +    }
> > +
> > +    umemp->index -= n;
> > +    ptr = &umemp->array[umemp->index];
> > +    memcpy(addrs, ptr, n * sizeof(void *));
> > +
> > +    return 0;
> > +}
> > +
> > +int
> > +umem_elem_pop_n(struct umem_pool *umemp, int n, void **addrs)
> > +{
> > +    int ret;
> > +
> > +    ovs_spin_lock(&umemp->mutex);
> > +    ret = __umem_elem_pop_n(umemp, n, addrs);
> > +    ovs_spin_unlock(&umemp->mutex);
> > +
> > +    return ret;
> > +}
> > +
> > +static inline void *
> > +__umem_elem_pop(struct umem_pool *umemp)
> > +{
> > +    if (OVS_UNLIKELY(umemp->index - 1 < 0)) {
> > +        return NULL;
> > +    }
> > +
> > +    return umemp->array[--umemp->index];
> > +}
> > +
> > +void *
> > +umem_elem_pop(struct umem_pool *umemp)
> > +{
> > +    void *ptr;
> > +
> > +    ovs_spin_lock(&umemp->mutex);
> > +    ptr = __umem_elem_pop(umemp);
> > +    ovs_spin_unlock(&umemp->mutex);
> > +
> > +    return ptr;
> > +}
> > +
> > +static void **
> > +__umem_pool_alloc(unsigned int size)
> > +{
> > +    void *bufs;
> > +    int ret;
> > +
> > +    ret = posix_memalign(&bufs, getpagesize(),
> > +                         size * sizeof(void *));
>
> xmalloc_pagealign ?
>
> > +    if (ret) {
> > +        return NULL;
> > +    }
> > +
> > +    memset(bufs, 0, size * sizeof(void *));
> > +    return (void **)bufs;
> > +}
> > +
> > +int
> > +umem_pool_init(struct umem_pool *umemp, unsigned int size)
> > +{
> > +    umemp->array = __umem_pool_alloc(size);
> > +    if (!umemp->array) {
> > +        return -ENOMEM;
> > +    }
> > +
> > +    umemp->size = size;
> > +    umemp->index = 0;
> > +    ovs_spinlock_init(&umemp->mutex);
> > +    return 0;
> > +}
> > +
> > +void
> > +umem_pool_cleanup(struct umem_pool *umemp)
> > +{
> > +    free(umemp->array);
>
> free_pagealign ?
>
> > +    umemp->array = NULL;
> > +}
> > +
> > +/* AF_XDP metadata init/destroy */
> > +int
> > +xpacket_pool_init(struct xpacket_pool *xp, unsigned int size)
> > +{
> > +    void *bufs;
> > +    int ret;
> > +
> > +    ret = posix_memalign(&bufs, getpagesize(),
> > +                         size * sizeof(struct dp_packet_afxdp));
> > +    if (ret) {
> > +        return -ENOMEM;
> > +    }
> > +    memset(bufs, 0, size * sizeof(struct dp_packet_afxdp));
> > +
> > +    xp->array = bufs;
> > +    xp->size = size;
> > +    return 0;
> > +}
> > +
> > +void
> > +xpacket_pool_cleanup(struct xpacket_pool *xp)
> > +{
> > +    free(xp->array);
> > +    xp->array = NULL;
> > +}
> > diff --git a/lib/xdpsock.h b/lib/xdpsock.h
> > new file mode 100644
> > index 000000000000..72578e383812
> > --- /dev/null
> > +++ b/lib/xdpsock.h
> > @@ -0,0 +1,101 @@
> > +/*
> > + * Copyright (c) 2018, 2019 Nicira, Inc.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#ifndef XDPSOCK_H
> > +#define XDPSOCK_H 1
> > +
> > +#include <config.h>
> > +
> > +#ifdef HAVE_AF_XDP
> > +
> > +#include <bpf/xsk.h>
> > +#include <errno.h>
> > +#include <stdbool.h>
> > +#include <stdio.h>
> > +
> > +#include "openvswitch/thread.h"
> > +#include "ovs-atomic.h"
> > +#include "spinlock.h"
> > +
> > +#define FRAME_HEADROOM  XDP_PACKET_HEADROOM
> > +#define FRAME_SIZE      XSK_UMEM__DEFAULT_FRAME_SIZE
> > +#define FRAME_SHIFT     XSK_UMEM__DEFAULT_FRAME_SHIFT
> > +#define FRAME_SHIFT_MASK    ((1 << FRAME_SHIFT) - 1)
> > +
> > +#define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS
> > +#define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS
> > +
> > +/* The worst case is all 4 queues TX/CQ/RX/FILL are full.
> > + * Setting NUM_FRAMES to this makes sure umem_pop always successes.
> > + */
> > +#define NUM_FRAMES      (2 * (PROD_NUM_DESCS + CONS_NUM_DESCS))
> > +
> > +#define BATCH_SIZE      NETDEV_MAX_BURST
> > +
> > +BUILD_ASSERT_DECL(IS_POW2(NUM_FRAMES));
> > +BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS);
> > +BUILD_ASSERT_DECL(NUM_FRAMES == 2 * (PROD_NUM_DESCS + CONS_NUM_DESCS));
> > +
> > +/* LIFO ptr_array */
> > +struct umem_pool {
> > +    int index;      /* point to top */
> > +    unsigned int size;
> > +    ovs_spinlock_t mutex;
>
> It's a bit confusing to name it a 'mutex'. Sounds like it's 'ovs_mutex'.
> Probably, it'll be better to name it 'spinlock' or just 'lock'.
>
OK

<snip>

Regards,
William
