Hi Ilya, Thanks for your review.
On Thu, May 30, 2019 at 8:57 AM Ilya Maximets <[email protected]> wrote: > > On 28.05.2019 22:01, William Tu wrote: > > The patch introduces experimental AF_XDP support for OVS netdev. > > diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h > > index 859c05613ddf..a33b9a7353ba 100644 > > --- a/lib/dpif-netdev-perf.h > > +++ b/lib/dpif-netdev-perf.h > > @@ -21,6 +21,7 @@ > > #include <stddef.h> > > #include <stdint.h> > > #include <string.h> > > +#include <time.h> > > #include <math.h> > > > > #ifdef DPDK_NETDEV > > @@ -186,6 +187,24 @@ struct pmd_perf_stats { > > char *log_reason; > > }; > > > > +#ifdef HAVE_AF_XDP > > I'd like to change this to "#ifdef __linux__". > 'clock_gettime' is posix compliant, but CLOCK_MONOTONIC_RAW is > Linux specific. Yes, thanks, will do it. > > > +static inline uint64_t > > +rdtsc_syscall(struct pmd_perf_stats *s) > > +{ > > + struct timespec val; > > + uint64_t v; > > + > > + if (clock_gettime(CLOCK_MONOTONIC_RAW, &val) != 0) { > > + return s->last_tsc = 0; > > Maybe it's better to just return the value and allow caller to assign? Do you mean just: return s->last_tsc; > This way you'll not need to pass any arguments here. I don't understand, I still need to pass &val, right? > > > + } > > + > > + v = (uint64_t) val.tv_sec * 1000000000LL; > > + v += (uint64_t) val.tv_nsec; > > + > > + return s->last_tsc = v; > > +} > > +#endif > > + > > /* Support for accurate timing of PMD execution on TSC clock cycle level. > > * These functions are intended to be invoked in the context of pmd > > threads. */ > > > > @@ -198,6 +217,15 @@ cycles_counter_update(struct pmd_perf_stats *s) > > { > > #ifdef DPDK_NETDEV > > return s->last_tsc = rte_get_tsc_cycles(); > > +#elif defined(HAVE_AF_XDP) && defined(__x86_64__) > > And this should be: > #elif !defined(_MSC_VER) && defined(__x86_64__) > > Visual Studio doesn't support inline assembly this way. > Other things are portable until we're on x86_64. right, thanks! > > > + /* This is x86-specific instructions. */ > > + uint32_t h, l; > > + asm volatile("rdtsc" : "=a" (l), "=d" (h)); > > + > > + return s->last_tsc = ((uint64_t) h << 32) | l; > > +#elif defined(HAVE_AF_XDP) > > #elif defined(__linux__) > > > + /* non-x86_64 architecture uses syscall */ > > + return rdtsc_syscall(s); > > #else > > return s->last_tsc = 0; > > #endif > > diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c > > new file mode 100644 <snip> > > + > > +static void > > +xsk_destroy_all(struct netdev *netdev) > > +{ > > + struct netdev_linux *dev = netdev_linux_cast(netdev); > > + int i, ifindex; > > + > > + ifindex = linux_get_ifindex(netdev_get_name(netdev)); > > + > > + for (i = 0; i < MAX_XSKQ; i++) { > > + if (dev->xsk[i]) { > > + VLOG_INFO("destroy xsk[%d]", i); > > + xsk_destroy(dev->xsk[i]); > > + dev->xsk[i] = NULL; > > + dev->xsk[i]->rx_dropped = 0; > > + dev->xsk[i]->tx_dropped = 0; > > Dereferencing of a just assigned NULL poiner. Something is definitely > wrong here. oh, thanks a lot. > > > + } > > + } > > + VLOG_INFO("remove xdp program"); > > + xsk_remove_xdp_program(ifindex, dev->xdpmode); > > +} > > + > > +static inline void OVS_UNUSED > > +log_xsk_stat(struct xsk_socket_info *xsk OVS_UNUSED) { > > + struct xdp_statistics stat; > > + socklen_t optlen; > > + > > + optlen = sizeof stat; > > + ovs_assert(getsockopt(xsk_socket__fd(xsk->xsk), SOL_XDP, > > XDP_STATISTICS, > > + &stat, &optlen) == 0); > > + > > + VLOG_DBG_RL(&rl, "rx dropped %llu, rx_invalid %llu, tx_invalid %llu", > > + stat.rx_dropped, > > + stat.rx_invalid_descs, > > + stat.tx_invalid_descs); > > +} > > + > > +int > > +netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args, > > + char **errp OVS_UNUSED) > > +{ > > + struct netdev_linux *dev = netdev_linux_cast(netdev); > > + const char *str_xdpmode; > > + int xdpmode, new_n_rxq; > > + > > + ovs_mutex_lock(&dev->mutex); > > + new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1); > > + if (new_n_rxq > MAX_XSKQ) { > > + ovs_mutex_unlock(&dev->mutex); > > + VLOG_ERR("%s: Too big 'n_rxq' (%d > %d).", > > + netdev_get_name(netdev), new_n_rxq, MAX_XSKQ); > > + return EINVAL; > > + } > > + > > + str_xdpmode = smap_get_def(args, "xdpmode", "skb"); > > + if (!strcasecmp(str_xdpmode, "drv")) { > > + xdpmode = XDP_ZEROCOPY; > > + } else if (!strcasecmp(str_xdpmode, "skb")) { > > + xdpmode = XDP_COPY; > > + } else { > > + VLOG_ERR("%s: Incorrect xdpmode (%s).", > > + netdev_get_name(netdev), str_xdpmode); > > + ovs_mutex_unlock(&dev->mutex); > > + return EINVAL; > > + } > > + > > + if (dev->requested_n_rxq != new_n_rxq > > + || dev->requested_xdpmode != xdpmode) { > > + dev->requested_n_rxq = new_n_rxq; > > + dev->requested_xdpmode = xdpmode; > > + netdev_request_reconfigure(netdev); > > + } > > + ovs_mutex_unlock(&dev->mutex); > > + return 0; > > +} > > + > > +int > > +netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args) > > +{ > > + struct netdev_linux *dev = netdev_linux_cast(netdev); > > + > > + ovs_mutex_lock(&dev->mutex); > > + smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); > > + smap_add_format(args, "xdpmode", "%s", > > + dev->xdp_bind_flags == XDP_ZEROCOPY ? "drv" : "skb"); > > + ovs_mutex_unlock(&dev->mutex); > > + return 0; > > +} > > + > > +int > > +netdev_afxdp_reconfigure(struct netdev *netdev) > > +{ > > + struct netdev_linux *dev = netdev_linux_cast(netdev); > > + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; > > + int err = 0; > > + > > + ovs_mutex_lock(&dev->mutex); > > + > > + if (netdev->n_rxq == dev->requested_n_rxq > > + && dev->xdpmode == dev->requested_xdpmode) { > > + goto out; > > + } > > + > > + xsk_destroy_all(netdev); > > + netdev->n_rxq = dev->requested_n_rxq; > > + > > + if (dev->requested_xdpmode == XDP_ZEROCOPY) { > > + VLOG_INFO("AF_XDP device %s in DRV mode", netdev_get_name(netdev)); > > + /* From SKB mode to DRV mode */ > > + dev->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_DRV_MODE; > > + dev->xdp_bind_flags = XDP_ZEROCOPY; > > + dev->xdpmode = XDP_ZEROCOPY; > > + > > + if (setrlimit(RLIMIT_MEMLOCK, &r)) { > > + VLOG_ERR("ERROR: setrlimit(RLIMIT_MEMLOCK): %s", > > + ovs_strerror(errno)); > > + } > > + } else { > > + VLOG_INFO("AF_XDP device %s in SKB mode", netdev_get_name(netdev)); > > + /* From DRV mode to SKB mode */ > > + dev->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE; > > + dev->xdp_bind_flags = XDP_COPY; > > + dev->xdpmode = XDP_COPY; > > + /* TODO: set rlimit back to previous value > > + * when no device is in DRV mode. > > + */ > > + } > > + > > + err = xsk_configure_all(netdev); > > + if (err) { > > + VLOG_ERR("AF_XDP device %s reconfig fails", > > netdev_get_name(netdev)); > > + } > > + netdev_change_seq_changed(netdev); > > +out: > > + ovs_mutex_unlock(&dev->mutex); > > + return err; > > +} > > + > > +int > > +netdev_afxdp_get_numa_id(const struct netdev *netdev) > > +{ > > + /* FIXME: Get netdev's PCIe device ID, then find > > + * its NUMA node id. > > + */ > > + VLOG_INFO("FIXME: Device %s always use numa id 0", > > + netdev_get_name(netdev)); > > + return 0; > > +} > > + > > +static void > > +xsk_remove_xdp_program(uint32_t ifindex, int xdpmode) > > +{ > > + uint32_t curr_prog_id = 0; > > + uint32_t flags; > > + > > + /* remove_xdp_program() */ > > + if (xdpmode == XDP_COPY) { > > + flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE; > > + } else { > > + flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_DRV_MODE; > > + } > > + > > + if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, flags)) { > > + bpf_set_link_xdp_fd(ifindex, -1, flags); > > + } > > + if (prog_id == curr_prog_id) { > > + bpf_set_link_xdp_fd(ifindex, -1, flags); > > + } else if (!curr_prog_id) { > > + VLOG_INFO("couldn't find a prog id on a given interface"); > > + } else { > > + VLOG_INFO("program on interface changed, not removing"); > > + } > > +} > > + > > +void > > +signal_remove_xdp(struct netdev *netdev) > > +{> + struct netdev_linux *dev = netdev_linux_cast(netdev); > > + int ifindex; > > + > > + ifindex = linux_get_ifindex(netdev_get_name(netdev)); > > + > > + VLOG_WARN("force remove xdp program"); > > + xsk_remove_xdp_program(ifindex, dev->xdpmode); > > +} > > + > > +static struct dp_packet_afxdp * > > +dp_packet_cast_afxdp(const struct dp_packet *d) > > +{ > > + ovs_assert(d->source == DPBUF_AFXDP); > > + return CONTAINER_OF(d, struct dp_packet_afxdp, packet); > > +} > > + > > +void > > +free_afxdp_buf(struct dp_packet *p) > > +{ > > + struct dp_packet_afxdp *xpacket; > > + unsigned long addr; > > + > > + xpacket = dp_packet_cast_afxdp(p); > > + if (xpacket->mpool) { > > + void *base = dp_packet_base(p); > > + > > + addr = (unsigned long)base & (~FRAME_SHIFT_MASK); > > + umem_elem_push(xpacket->mpool, (void *)addr); > > + } > > +} > > + > > +static void > > +free_afxdp_buf_batch(struct dp_packet_batch *batch) > > +{ > > + struct dp_packet_afxdp *xpacket = NULL; > > + struct dp_packet *packet; > > + void *elems[BATCH_SIZE]; > > + unsigned long addr; > > + > > + /* all packets are AF_XDP, so handles its own delete in batch */ > > This comment should be somewhere else. > > BTW, shift right by 1 space. > > > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > + xpacket = dp_packet_cast_afxdp(packet); > > + if (xpacket->mpool) { > > + void *base = dp_packet_base(packet); > > + > > + addr = (unsigned long)base & (~FRAME_SHIFT_MASK); > > Shouldn't it be uintptr_t ? Probably in some other places too. right, I will use uintptr_t here and other places. > > > + elems[i] = (void *)addr; > > + } > > + } > > + umem_elem_push_n(xpacket->mpool, batch->count, elems); > > + dp_packet_batch_init(batch); > > +} > > + > > +int > > +netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch > > *batch, > > + int *qfill) > > +{ > > + struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_); > > + struct netdev *netdev = rx->up.netdev; > > + struct netdev_linux *dev = netdev_linux_cast(netdev); > > + struct umem_elem *elems[BATCH_SIZE]; > > + uint32_t idx_rx = 0, idx_fq = 0; > > + struct xsk_socket_info *xsk; > > + int qid = rxq_->queue_id; > > + unsigned int rcvd, i; > > + int ret = 0; > > + > > + xsk = dev->xsk[qid]; > > + rx->fd = xsk_socket__fd(xsk->xsk); > > + > > + /* See if there is any packet on RX queue, > > + * if yes, idx_rx is the index having the packet. > > + */ > > + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); > > + if (!rcvd) { > > + return 0; > > + } > > + > > + ret = umem_elem_pop_n(&xsk->umem->mpool, rcvd, (void **)elems); > > + if (OVS_UNLIKELY(ret)) { > > We need to return rx buffers to mpool before releasing. > Otherwise they will be lost. > > for (i = 0; i < rcvd; i++) { > uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, i)->addr; > > elems[i] = xsk_umem__get_data(xsk->umem->buffer, addr); > } > umem_elem_push_n(&xsk->umem->mpool, rcvd, (void **)elems); > > Please, re-check above code snippet before using. good point, thanks > > > + xsk_ring_cons__release(&xsk->rx, rcvd); > > + xsk->rx_dropped += rcvd; > > + return ENOMEM; > > + } > > + > > + /* Prepare for the FILL queue */ > > + if (!xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq)) { > > + /* The FILL queue is full, don't retry or process rx. Wait for > > kernel > > + * to move received packets from FILL queue to RX queue. > > + */ > > + umem_elem_push_n(&xsk->umem->mpool, rcvd, (void **)elems); > > Same here. > > > + xsk_ring_cons__release(&xsk->rx, rcvd); > > + xsk->rx_dropped += rcvd; > > + return ENOMEM; > > + } > > + > > + /* Setup a dp_packet batch from descriptors in RX queue */ > > + for (i = 0; i < rcvd; i++) { > > + uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; > > + uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->len; > > + char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); > > + uint64_t index; > > + > > + struct dp_packet_afxdp *xpacket; > > + struct dp_packet *packet; > > + > > + index = addr >> FRAME_SHIFT; > > + xpacket = UMEM2XPKT(xsk->umem->xpool.array, index); > > + packet = &xpacket->packet; > > + > > + /* Initialize the struct dp_packet */ > > + dp_packet_use_afxdp(packet, pkt, FRAME_SIZE - FRAME_HEADROOM); > > + dp_packet_set_size(packet, len); > > + > > + /* Add packet into batch, increase batch->count */ > > + dp_packet_batch_add(batch, packet); > > + > > + idx_rx++; > > + } > > + /* Release the RX queue */ > > + xsk_ring_cons__release(&xsk->rx, rcvd); > > + > > + for (i = 0; i < rcvd; i++) { > > + uint64_t index; > > + struct umem_elem *elem; > > + > > + /* Get one free umem, program it into FILL queue */ > > + elem = elems[i]; > > + index = (uint64_t)((char *)elem - (char *)xsk->umem->buffer); > > + ovs_assert((index & FRAME_SHIFT_MASK) == 0); > > + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq) = index; > > + > > + idx_fq++; > > + } > > + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); > > + > > + if (qfill) { > > + /* TODO: return the number of remaining packets in the queue. */ > > + *qfill = 0; > > + } > > + > > +#ifdef AFXDP_DEBUG > > + log_xsk_stat(xsk); > > +#endif > > + return 0; > > +} > > + > > +static inline int > > +kick_tx(struct xsk_socket_info *xsk) > > +{ > > + int ret; > > + > > + /* This causes system call into kernel's xsk_sendmsg, and > > + * xsk_generic_xmit (skb mode) or xsk_async_xmit (driver mode). > > + */ > > + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); > > + if (OVS_UNLIKELY(ret < 0)) { > > + if (errno == ENXIO || errno == ENOBUFS || errno == EOPNOTSUPP) { > > + return errno; > > + } > > + } > > + /* no error, or EBUSY or EAGAIN */ > > + return 0; > > +} > > + > > +static inline bool > > +check_free_batch(struct dp_packet_batch *batch) > > +{ > > + struct umem_pool *first_mpool = NULL; > > + struct dp_packet_afxdp *xpacket; > > + struct dp_packet *packet; > > + > > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > + if (packet->source != DPBUF_AFXDP) { > > + return false; > > + } > > + xpacket = dp_packet_cast_afxdp(packet); > > + if (i == 0) { > > + first_mpool = xpacket->mpool; > > + continue; > > + } > > + if (xpacket->mpool != first_mpool) { > > + return false; > > + } > > + } > > + /* All packets are DPBUF_AFXDP and from the same mpool */ > > + return true; > > +} > > + > > +static inline void > > +afxdp_complete_tx(struct xsk_socket_info *xsk) > > +{ > > + struct umem_elem *elems_push[BATCH_SIZE]; > > + uint32_t idx_cq = 0; > > + int tx_done, j, ret; > > + > > + if (!xsk->outstanding_tx) { > > + return; > > + } > > + > > + ret = kick_tx(xsk); > > + if (OVS_UNLIKELY(ret)) { > > + VLOG_WARN_RL(&rl, "error sending AF_XDP packet: %s", > > + ovs_strerror(ret)); > > + } > > + > > + tx_done = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx_cq); > > + if (tx_done > 0) { > > + xsk_ring_cons__release(&xsk->umem->cq, tx_done); > > + xsk->outstanding_tx -= tx_done; > > + } > > + > > + /* Recycle back to umem pool */ > > + for (j = 0; j < tx_done; j++) { > > + struct umem_elem *elem; > > + uint64_t addr; > > + > > + addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx_cq++); > > + elem = ALIGNED_CAST(struct umem_elem *, > > + (char *)xsk->umem->buffer + addr); > > + elems_push[j] = elem; > > + } > > + > > + umem_elem_push_n(&xsk->umem->mpool, tx_done, (void **)elems_push); > > +} > > + > > +int > > +netdev_afxdp_batch_send(struct netdev *netdev_, int qid, > > + struct dp_packet_batch *batch, > > + bool concurrent_txq) > > +{ > > + struct netdev_linux *dev = netdev_linux_cast(netdev_); > > + struct xsk_socket_info *xsk = dev->xsk[qid]; > > + struct umem_elem *elems_pop[BATCH_SIZE]; > > + struct dp_packet *packet; > > + bool free_batch = true; > > + uint32_t idx = 0; > > + int error = 0; > > + int ret; > > + > > + if (OVS_UNLIKELY(concurrent_txq)) { > > + ovs_spin_lock(&dev->tx_lock); > > Using the same lock for all queues will procude a lot of unnecessary > contentions. It's better to allocate array of locks. One per tx queue. > You may re-allocate it in reconfigure() implementation. Right, will do per tx lock in next version. > > > + } > > + > > + /* Process CQ first. */ > > + afxdp_complete_tx(xsk); > > + > > + free_batch = check_free_batch(batch); > > + > > + ret = umem_elem_pop_n(&xsk->umem->mpool, batch->count, (void > > **)elems_pop); > > + if (OVS_UNLIKELY(ret)) { > > + xsk->tx_dropped += batch->count; > > + error = ENOMEM; > > + goto out; > > + } > > + > > + /* Make sure we have enough TX descs */ > > + ret = xsk_ring_prod__reserve(&xsk->tx, batch->count, &idx); > > + if (OVS_UNLIKELY(ret == 0)) { > > + umem_elem_push_n(&xsk->umem->mpool, batch->count, (void > > **)elems_pop); > > + xsk->tx_dropped += batch->count; > > + error = ENOMEM; > > + goto out; > > + } > > + > > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > + struct umem_elem *elem; > > + uint64_t index; > > + > > + elem = elems_pop[i]; > > + /* Copy the packet to the umem we just pop from umem pool. > > + * TODO: avoid this copy if the packet and the pop umem > > + * are located in the same umem. > > + */ > > + memcpy(elem, dp_packet_data(packet), dp_packet_size(packet)); > > + > > + index = (uint64_t)((char *)elem - (char *)xsk->umem->buffer); > > + xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr = index; > > + xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len > > + = dp_packet_size(packet); > > + } > > + xsk_ring_prod__submit(&xsk->tx, batch->count); > > + xsk->outstanding_tx += batch->count; > > + > > + ret = kick_tx(xsk); > > + if (OVS_UNLIKELY(ret)) { > > + umem_elem_push_n(&xsk->umem->mpool, batch->count, (void > > **)elems_pop); > > Do we really able to re-use these buffers? They are alredy in tx ring and > probably will be sent on next kick_tx(). > Right, so probably I should just print the warning message and continue. > > + VLOG_WARN_RL(&rl, "error sending AF_XDP packet: %s", > > + ovs_strerror(ret)); > > + } > > + > > +out: > > + if (free_batch) { > > + free_afxdp_buf_batch(batch); > > + } else { > > + dp_packet_delete_batch(batch, true); > > + } > > + > > + if (OVS_UNLIKELY(concurrent_txq)) { > > + ovs_spin_unlock(&dev->tx_lock); > > + } > > + return error; > > +} > > + > > +int > > +netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_ OVS_UNUSED) > > +{ > > + /* Done at reconfigure */ > > + return 0; > > +} > > + > > +void > > +netdev_afxdp_destruct(struct netdev *netdev_) > > +{ > > + struct netdev_linux *netdev = netdev_linux_cast(netdev_); > > + > > + /* Note: tc is by-passed when using drv-mode, but when using > > + * skb-mode, we might need to clean up tc. */ > > + > > + xsk_destroy_all(netdev_); > > + ovs_mutex_destroy(&netdev->mutex); > > +} > > + > > +int > > +netdev_afxdp_get_stats(const struct netdev *netdev_, > > You don't need an underscore here. > > > + struct netdev_stats *stats) > > +{ > > + struct netdev_linux *dev = netdev_linux_cast(netdev_); > > + struct netdev_stats dev_stats; > > + struct xsk_socket_info *xsk; > > + int error, i; > > + > > + ovs_mutex_lock(&dev->mutex); > > + > > + error = get_stats_via_netlink(netdev_, &dev_stats); > > + if (error) { > > + VLOG_WARN_RL(&rl, "Error getting AF_XDP statistics"); > > + } else { > > + /* Use kernel netdev's packet and byte counts */ > > + stats->rx_packets = dev_stats.rx_packets; > > + stats->rx_bytes = dev_stats.rx_bytes; > > + stats->tx_packets = dev_stats.tx_packets; > > + stats->tx_bytes = dev_stats.tx_bytes; > > + > > + stats->rx_errors += dev_stats.rx_errors; > > + stats->tx_errors += dev_stats.tx_errors; > > + stats->rx_dropped += dev_stats.rx_dropped; > > + stats->tx_dropped += dev_stats.tx_dropped; > > + stats->multicast += dev_stats.multicast; > > + stats->collisions += dev_stats.collisions; > > + stats->rx_length_errors += dev_stats.rx_length_errors; > > + stats->rx_over_errors += dev_stats.rx_over_errors; > > + stats->rx_crc_errors += dev_stats.rx_crc_errors; > > + stats->rx_frame_errors += dev_stats.rx_frame_errors; > > + stats->rx_fifo_errors += dev_stats.rx_fifo_errors; > > + stats->rx_missed_errors += dev_stats.rx_missed_errors; > > + stats->tx_aborted_errors += dev_stats.tx_aborted_errors; > > + stats->tx_carrier_errors += dev_stats.tx_carrier_errors; > > + stats->tx_fifo_errors += dev_stats.tx_fifo_errors; > > + stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors; > > + stats->tx_window_errors += dev_stats.tx_window_errors; > > + > > + /* Account the dropped in each xsk */ > > + for (i = 0; i < MAX_XSKQ; i++) { > > i < netdev_n_rxq(netdev) > > > + xsk = dev->xsk[i]; > > + if (xsk) { > > + stats->rx_dropped += xsk->rx_dropped; > > + stats->tx_dropped += xsk->tx_dropped; > > + } > > + } > > + } > > + ovs_mutex_unlock(&dev->mutex); > > + > > + return error; > > +} > > diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h > > new file mode 100644 > > index 000000000000..dd2dc1a2064d > > --- /dev/null > > +++ b/lib/netdev-afxdp.h > > @@ -0,0 +1,74 @@ > > +/* > > + * Copyright (c) 2018, 2019 Nicira, Inc. > > + * > > + * Licensed under the Apache License, Version 2.0 (the "License"); > > + * you may not use this file except in compliance with the License. > > + * You may obtain a copy of the License at: > > + * > > + * http://www.apache.org/licenses/LICENSE-2.0 > > + * > > + * Unless required by applicable law or agreed to in writing, software > > + * distributed under the License is distributed on an "AS IS" BASIS, > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > > + * See the License for the specific language governing permissions and > > + * limitations under the License. > > + */ > > + > > +#ifndef NETDEV_AFXDP_H > > +#define NETDEV_AFXDP_H 1 > > + > > +#include <config.h> > > + > > +#ifdef HAVE_AF_XDP > > + > > +#include <stdint.h> > > +#include <stdbool.h> > > + > > +/* These functions are Linux AF_XDP specific, so they should be used > > directly > > + * only by Linux-specific code. */ > > + > > +#define MAX_XSKQ 16 > > + > > +struct netdev; > > +struct xsk_socket_info; > > +struct xdp_umem; > > +struct dp_packet_batch; > > +struct smap; > > +struct dp_packet; > > +struct netdev_rxq; > > +struct netdev_stats; > > + > > +int netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_); > > +void netdev_afxdp_destruct(struct netdev *netdev_); > > + > > +int netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, > > + struct dp_packet_batch *batch, > > + int *qfill); > > +int netdev_afxdp_batch_send(struct netdev *netdev_, int qid, > > + struct dp_packet_batch *batch, > > + bool concurrent_txq); > > +int netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args, > > + char **errp); > > +int netdev_afxdp_get_config(const struct netdev *netdev, struct smap > > *args); > > +int netdev_afxdp_get_numa_id(const struct netdev *netdev); > > +int netdev_afxdp_get_stats(const struct netdev *netdev_, > > + struct netdev_stats *stats); > > + > > +void free_afxdp_buf(struct dp_packet *p); > > +int netdev_afxdp_reconfigure(struct netdev *netdev); > > +void signal_remove_xdp(struct netdev *netdev); > > + > > +#else /* !HAVE_AF_XDP */ > > + > > +#include "openvswitch/compiler.h" > > + > > +struct dp_packet; > > + > > +static inline void > > +free_afxdp_buf(struct dp_packet *p OVS_UNUSED) > > +{ > > + /* Nothing */ > > +} > > + > > +#endif /* HAVE_AF_XDP */ > > +#endif /* netdev-afxdp.h */ > > diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h > > new file mode 100644 > > index 000000000000..d43f79e6aa41 > > --- /dev/null > > +++ b/lib/netdev-linux-private.h > > @@ -0,0 +1,139 @@ > > +/* > > + * Copyright (c) 2019 Nicira, Inc. > > + * > > + * Licensed under the Apache License, Version 2.0 (the "License"); > > + * you may not use this file except in compliance with the License. > > + * You may obtain a copy of the License at: > > + * > > + * http://www.apache.org/licenses/LICENSE-2.0 > > + * > > + * Unless required by applicable law or agreed to in writing, software > > + * distributed under the License is distributed on an "AS IS" BASIS, > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > > + * See the License for the specific language governing permissions and > > + * limitations under the License. > > + */ > > + > > +#ifndef NETDEV_LINUX_PRIVATE_H > > +#define NETDEV_LINUX_PRIVATE_H 1 > > + > > +#include <config.h> > > + > > +#include <linux/filter.h> > > +#include <linux/gen_stats.h> > > +#include <linux/if_ether.h> > > +#include <linux/if_tun.h> > > +#include <linux/types.h> > > +#include <linux/ethtool.h> > > +#include <linux/mii.h> > > +#include <stdint.h> > > +#include <stdbool.h> > > + > > +#include "netdev-afxdp.h" > > +#include "netdev-provider.h" > > +#include "netdev-tc-offloads.h" > > +#include "netdev-vport.h" > > +#include "openvswitch/thread.h" > > +#include "ovs-atomic.h" > > +#include "timer.h" > > +#include "xdpsock.h" > > + > > +/* These functions are Linux specific, so they should be used directly > > only by > > + * Linux-specific code. */ > > + > > +struct netdev; > > + > > +struct netdev_rxq_linux { > > + struct netdev_rxq up; > > + bool is_tap; > > + int fd; > > +}; > > + > > +void netdev_linux_run(const struct netdev_class *); > > + > > +int netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag, > > + const char *flag_name, bool enable); > > + > > +int get_stats_via_netlink(const struct netdev *netdev_, > > + struct netdev_stats *stats); > > + > > +struct netdev_linux { > > + struct netdev up; > > + > > + /* Protects all members below. */ > > + struct ovs_mutex mutex; > > + > > + unsigned int cache_valid; > > + > > + bool miimon; /* Link status of last poll. */ > > + long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. > > */ > > + struct timer miimon_timer; > > + > > + int netnsid; /* Network namespace ID. */ > > + /* The following are figured out "on demand" only. They are only valid > > + * when the corresponding VALID_* bit in 'cache_valid' is set. */ > > + int ifindex; > > + struct eth_addr etheraddr; > > + int mtu; > > + unsigned int ifi_flags; > > + long long int carrier_resets; > > + uint32_t kbits_rate; /* Policing data. */ > > + uint32_t kbits_burst; > > + int vport_stats_error; /* Cached error code from > > vport_get_stats(). > > + 0 or an errno value. */ > > + int netdev_mtu_error; /* Cached error code from SIOCGIFMTU > > + * or SIOCSIFMTU. > > + */ > > + int ether_addr_error; /* Cached error code from set/get > > etheraddr. */ > > + int netdev_policing_error; /* Cached error code from set policing. */ > > + int get_features_error; /* Cached error code from ETHTOOL_GSET. */ > > + int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */ > > + > > + enum netdev_features current; /* Cached from ETHTOOL_GSET. */ > > + enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */ > > + enum netdev_features supported; /* Cached from ETHTOOL_GSET. */ > > + > > + struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */ > > + struct tc *tc; > > + > > + /* For devices of class netdev_tap_class only. */ > > + int tap_fd; > > + bool present; /* If the device is present in the > > namespace */ > > + uint64_t tx_dropped; /* tap device can drop if the iface is > > down */ > > + > > + /* LAG information. */ > > + bool is_lag_master; /* True if the netdev is a LAG master. */ > > + > > + /* AF_XDP information */ > > +#ifdef HAVE_AF_XDP > > + struct xsk_socket_info *xsk[MAX_XSKQ]; > > You may allocate this array dynamically based on the n_rxq while performing > reconfiguration. This way you will also have no limit on the number of rxqs. make sense, thanks. <snip> > > --- /dev/null > > +++ b/lib/spinlock.h > > @@ -0,0 +1,70 @@ > > +/* > > + * Copyright (c) 2018, 2019 Nicira, Inc. > > + * > > + * Licensed under the Apache License, Version 2.0 (the "License"); > > + * you may not use this file except in compliance with the License. > > + * You may obtain a copy of the License at: > > + * > > + * http://www.apache.org/licenses/LICENSE-2.0 > > + * > > + * Unless required by applicable law or agreed to in writing, software > > + * distributed under the License is distributed on an "AS IS" BASIS, > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > > + * See the License for the specific language governing permissions and > > + * limitations under the License. > > + */ > > +#ifndef SPINLOCK_H > > +#define SPINLOCK_H 1 > > + > > +#include <config.h> > > + > > +#include <ctype.h> > > +#include <errno.h> > > +#include <fcntl.h> > > +#include <stdarg.h> > > +#include <stdlib.h> > > +#include <unistd.h> > > + > > +#include "ovs-atomic.h" > > + > > +typedef struct { > > It's probably better to not use 'typedef'. OVS doesn't use > typedefs for structures, unions and enums usually. > For example we have no typedef for 'struct ovs_mutex'. > So, this should be just 'struct ovs_spinlock'. > > > We may also add some annotations like OVS_LOCKABLE and clang > thread safety annotations: OVS_ACQUIRES, OVS_TRY_LOCK, OVS_RELEASES. > However, this could be done later. > OK, will do it. > > + atomic_int locked; > > +} ovs_spinlock_t;> + > > +static inline void > > +ovs_spinlock_init(ovs_spinlock_t *sl) > > +{ > > + atomic_init(&sl->locked, 0); > > +} > > + > > +static inline void > > +ovs_spin_lock(ovs_spinlock_t *sl) > > +{ > > + int exp = 0, locked = 0; > > + > > + while (!atomic_compare_exchange_strong_explicit(&sl->locked, &exp, 1, > > + memory_order_acquire, > > + memory_order_relaxed)) { > > + locked = 1; > > + while (locked) { > > + atomic_read_relaxed(&sl->locked, &locked); > > + } > > + exp = 0; > > + } > > +} > > + > > +static inline void > > +ovs_spin_unlock(ovs_spinlock_t *sl) > > +{ > > + atomic_store_explicit(&sl->locked, 0, memory_order_release); > > +} > > + > > +static inline int OVS_UNUSED > > Not sure that we need UNUSED annotation since we're in header now. > > > +ovs_spin_trylock(ovs_spinlock_t *sl) > > +{ > > + int exp = 0; > > + return atomic_compare_exchange_strong_explicit(&sl->locked, &exp, 1, > > + memory_order_acquire, > > + memory_order_relaxed); > > +} > > +#endif > > diff --git a/lib/util.c b/lib/util.c > > index 5679232ffc5f..060b1e287bce 100644 > > --- a/lib/util.c > > +++ b/lib/util.c > > @@ -277,6 +277,49 @@ free_cacheline(void *p) > > #endif > > } > > > > +#ifdef HAVE_AF_XDP > > I don't think that we need 'ifdef' here. > > How about re-naming 'xmalloc_cacheline' to 'xmalloc_size_align' > making it allocate memory aligned to a specified size and in > a dedicated cachelines? > > And implement two functions: > xmalloc_cacheline(size) > { > return xmalloc_size_align(size, CACHE_LINE_SIZE); > } > xmalloc_pagealign(size) > { > return xmalloc_size_align(size, get_page_size()); > } > I think it's better, will do it. > > +void * > > +xmalloc_pagealign(size_t size) > > +{ > > +#ifdef HAVE_POSIX_MEMALIGN > > + void *p; > > + int error; > > + > > + COVERAGE_INC(util_xalloc); > > + error = posix_memalign(&p, get_page_size(), size ? size : 1); > > + if (error != 0) { > > + out_of_memory(); > > + } > > + return p; > > +#else > > + /* Similar to xmalloc_cacheline, but replace > > + * CACHE_LINE_SIZE with get_page_size() */ > > + void *p = xmalloc((get_page_size() - 1) > > + + sizeof(void *) > > + + ROUND_UP(size, get_page_size())); > > I think that you don't need to round up to a page size. > You need to round up to a CACHE_LINE_SIZE, probably. > There is no point to allocate so much memory more. > Below code should be re-checked too. > Right, in worst case we should waste (page_size() - 1) bytes. > > + bool runt = PAD_SIZE((uintptr_t) p, get_page_size()) < sizeof(void *); > > + void *r = (void *) ROUND_UP((uintptr_t) p + (runt ? get_page_size() : > > 0), > > + get_page_size()); > > + void **q = (void **) r - 1; > > + *q = p; > > + return r; > > +#endif > > +} > > + > > +void > > +free_pagealign(void *p) > > +{ > > +#ifdef HAVE_POSIX_MEMALIGN > > + free(p); > > +#else > > + if (p) { > > + void **q = (void **) p - 1; > > + free(*q); > > + } > > +#endif > > +} > > +#endif > > + > > char * > > xasprintf(const char *format, ...) > > { > > diff --git a/lib/util.h b/lib/util.h > > index 53354f1c6f0f..3cd8cf87fba8 100644 > > --- a/lib/util.h > > +++ b/lib/util.h > > @@ -163,6 +163,11 @@ void ovs_strzcpy(char *dst, const char *src, size_t > > size); > > > > int string_ends_with(const char *str, const char *suffix); > > > > +#ifdef HAVE_AF_XDP > > +void *xmalloc_pagealign(size_t) MALLOC_LIKE; > > +void free_pagealign(void *); > > +#endif > > + > > /* The C standards say that neither the 'dst' nor 'src' argument to > > * memcpy() may be null, even if 'n' is zero. This wrapper tolerates > > * the null case. */ > > diff --git a/lib/xdpsock.c b/lib/xdpsock.c > > new file mode 100644 > > index 000000000000..ffdb54dfcd27 > > --- /dev/null > > +++ b/lib/xdpsock.c > > @@ -0,0 +1,179 @@ > > +/* > > + * Copyright (c) 2018, 2019 Nicira, Inc. > > + * > > + * Licensed under the Apache License, Version 2.0 (the "License"); > > + * you may not use this file except in compliance with the License. > > + * You may obtain a copy of the License at: > > + * > > + * http://www.apache.org/licenses/LICENSE-2.0 > > + * > > + * Unless required by applicable law or agreed to in writing, software > > + * distributed under the License is distributed on an "AS IS" BASIS, > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > > + * See the License for the specific language governing permissions and > > + * limitations under the License. > > + */ > > +#include <config.h> > > + > > +#include "xdpsock.h" > > +#include "dp-packet.h" > > +#include "openvswitch/compiler.h" > > + > > +/* Note: > > + * umem_elem_push* shouldn't overflow because we always pop > > + * elem first, then push back to the stack. > > + */ > > +static inline void > > +__umem_elem_push_n(struct umem_pool *umemp, int n, void **addrs) > > +{ > > + void *ptr; > > + > > + if (OVS_UNLIKELY(umemp->index + n > umemp->size)) { > > + OVS_NOT_REACHED(); > > + } > > + > > + ptr = &umemp->array[umemp->index]; > > + memcpy(ptr, addrs, n * sizeof(void *)); > > + umemp->index += n; > > +} > > + > > +void umem_elem_push_n(struct umem_pool *umemp, int n, void **addrs) > > +{ > > + ovs_spin_lock(&umemp->mutex); > > + __umem_elem_push_n(umemp, n, addrs); > > + ovs_spin_unlock(&umemp->mutex); > > +} > > + > > +static inline void > > +__umem_elem_push(struct umem_pool *umemp, void *addr) > > +{ > > + if (OVS_UNLIKELY(umemp->index + 1) > umemp->size) { > > + OVS_NOT_REACHED(); > > + } > > + > > + umemp->array[umemp->index++] = addr; > > +} > > + > > +void > > +umem_elem_push(struct umem_pool *umemp, void *addr) > > +{ > > + > > + ovs_assert(((uint64_t)addr & FRAME_SHIFT_MASK) == 0); > > + > > + ovs_spin_lock(&umemp->mutex); > > + __umem_elem_push(umemp, addr); > > + ovs_spin_unlock(&umemp->mutex); > > +} > > + > > +static inline int > > +__umem_elem_pop_n(struct umem_pool *umemp, int n, void **addrs) > > +{ > > + void *ptr; > > + > > + if (OVS_UNLIKELY(umemp->index - n < 0)) { > > + return -ENOMEM; > > + } > > + > > + umemp->index -= n; > > + ptr = &umemp->array[umemp->index]; > > + memcpy(addrs, ptr, n * sizeof(void *)); > > + > > + return 0; > > +} > > + > > +int > > +umem_elem_pop_n(struct umem_pool *umemp, int n, void **addrs) > > +{ > > + int ret; > > + > > + ovs_spin_lock(&umemp->mutex); > > + ret = __umem_elem_pop_n(umemp, n, addrs); > > + ovs_spin_unlock(&umemp->mutex); > > + > > + return ret; > > +} > > + > > +static inline void * > > +__umem_elem_pop(struct umem_pool *umemp) > > +{ > > + if (OVS_UNLIKELY(umemp->index - 1 < 0)) { > > + return NULL; > > + } > > + > > + return umemp->array[--umemp->index]; > > +} > > + > > +void * > > +umem_elem_pop(struct umem_pool *umemp) > > +{ > > + void *ptr; > > + > > + ovs_spin_lock(&umemp->mutex); > > + ptr = __umem_elem_pop(umemp); > > + ovs_spin_unlock(&umemp->mutex); > > + > > + return ptr; > > +} > > + > > +static void ** > > +__umem_pool_alloc(unsigned int size) > > +{ > > + void *bufs; > > + int ret; > > + > > + ret = posix_memalign(&bufs, getpagesize(), > > + size * sizeof(void *)); > > xmalloc_pagealign ? > > > + if (ret) { > > + return NULL; > > + } > > + > > + memset(bufs, 0, size * sizeof(void *)); > > + return (void **)bufs; > > +} > > + > > +int > > +umem_pool_init(struct umem_pool *umemp, unsigned int size) > > +{ > > + umemp->array = __umem_pool_alloc(size); > > + if (!umemp->array) { > > + return -ENOMEM; > > + } > > + > > + umemp->size = size; > > + umemp->index = 0; > > + ovs_spinlock_init(&umemp->mutex); > > + return 0; > > +} > > + > > +void > > +umem_pool_cleanup(struct umem_pool *umemp) > > +{ > > + free(umemp->array); > > free_pagealign ? > > > + umemp->array = NULL; > > +} > > + > > +/* AF_XDP metadata init/destroy */ > > +int > > +xpacket_pool_init(struct xpacket_pool *xp, unsigned int size) > > +{ > > + void *bufs; > > + int ret; > > + > > + ret = posix_memalign(&bufs, getpagesize(), > > + size * sizeof(struct dp_packet_afxdp)); > > + if (ret) { > > + return -ENOMEM; > > + } > > + memset(bufs, 0, size * sizeof(struct dp_packet_afxdp)); > > + > > + xp->array = bufs; > > + xp->size = size; > > + return 0; > > +} > > + > > +void > > +xpacket_pool_cleanup(struct xpacket_pool *xp) > > +{ > > + free(xp->array); > > + xp->array = NULL; > > +} > > diff --git a/lib/xdpsock.h b/lib/xdpsock.h > > new file mode 100644 > > index 000000000000..72578e383812 > > --- /dev/null > > +++ b/lib/xdpsock.h > > @@ -0,0 +1,101 @@ > > +/* > > + * Copyright (c) 2018, 2019 Nicira, Inc. > > + * > > + * Licensed under the Apache License, Version 2.0 (the "License"); > > + * you may not use this file except in compliance with the License. > > + * You may obtain a copy of the License at: > > + * > > + * http://www.apache.org/licenses/LICENSE-2.0 > > + * > > + * Unless required by applicable law or agreed to in writing, software > > + * distributed under the License is distributed on an "AS IS" BASIS, > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > > + * See the License for the specific language governing permissions and > > + * limitations under the License. > > + */ > > + > > +#ifndef XDPSOCK_H > > +#define XDPSOCK_H 1 > > + > > +#include <config.h> > > + > > +#ifdef HAVE_AF_XDP > > + > > +#include <bpf/xsk.h> > > +#include <errno.h> > > +#include <stdbool.h> > > +#include <stdio.h> > > + > > +#include "openvswitch/thread.h" > > +#include "ovs-atomic.h" > > +#include "spinlock.h" > > + > > +#define FRAME_HEADROOM XDP_PACKET_HEADROOM > > +#define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE > > +#define FRAME_SHIFT XSK_UMEM__DEFAULT_FRAME_SHIFT > > +#define FRAME_SHIFT_MASK ((1 << FRAME_SHIFT) - 1) > > + > > +#define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS > > +#define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS > > + > > +/* The worst case is all 4 queues TX/CQ/RX/FILL are full. > > + * Setting NUM_FRAMES to this makes sure umem_pop always successes. > > + */ > > +#define NUM_FRAMES (2 * (PROD_NUM_DESCS + CONS_NUM_DESCS)) > > + > > +#define BATCH_SIZE NETDEV_MAX_BURST > > + > > +BUILD_ASSERT_DECL(IS_POW2(NUM_FRAMES)); > > +BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS); > > +BUILD_ASSERT_DECL(NUM_FRAMES == 2 * (PROD_NUM_DESCS + CONS_NUM_DESCS)); > > + > > +/* LIFO ptr_array */ > > +struct umem_pool { > > + int index; /* point to top */ > > + unsigned int size; > > + ovs_spinlock_t mutex; > > It's a bit confusing to name it a 'mutex'. Sounds like it's 'ovs_mutex'. > Probably, it'll be better to name it 'spinlock' or just 'lock'. > OK <snip> Regards, William _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
