[dpdk-dev] [PATCH] vhost: optimize vhost memcpy

2016-12-02 Thread Zhihong Wang
This patch optimizes Vhost performance for large packets when the
Mergeable Rx buffer feature is enabled. It introduces a dedicated
memcpy function for vhost enqueue/dequeue to replace rte_memcpy.

The reason is that rte_memcpy is for general cases: it handles
unaligned copies and makes stores aligned; it even makes loads aligned
for microarchitectures like Ivy Bridge. However, alignment handling
comes at a price: it introduces extra load/store instructions.

Vhost memcpy is rather special: The copy is aligned, and remote,
and there is a header write along with it which is also remote. In
this case the memcpy instruction stream should be simplified to
reduce extra load/store, therefore reducing the probability of a
pipeline stall caused by a full load/store buffer, so that the actual
memcpy instructions can be issued and the H/W prefetcher can go to
work as early as possible.

Performance gain is visible when packet size:

 1. Larger than 512 bytes on AVX/SSE platforms like Ivy Bridge

 2. Larger than 256 bytes on AVX2 platforms like Haswell

 3. Larger than 512 bytes on AVX512 platforms like Skylake

Up to 20% gain can be achieved by this patch for PVP traffic. The
test can also be conducted without NIC, by using loopback traffic
between Vhost and Virtio. For example, increase TXONLY_DEF_PACKET_LEN
to the requested packet size in testpmd.h, rebuild and start testpmd
in both host and guest, then "start" on one side and "start tx_first 32"
on the other.


Signed-off-by: Zhihong Wang <zhihong.w...@intel.com>
---
 lib/librte_vhost/virtio_net.c | 72 +--
 1 file changed, 69 insertions(+), 3 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 595f67c..cd6f21a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -50,6 +50,72 @@
 #define MAX_PKT_BURST 32
 #define VHOST_LOG_PAGE 4096
 
+/**
+ * This function is used for vhost memcpy, to replace rte_memcpy.
+ * The reason is that rte_memcpy is for general cases, where vhost
+ * memcpy is a rather special case: The copy is aligned, and remote,
+ * and there is a header write along with it which is also remote. In
+ * this case the memcpy instruction stream should be simplified to
+ * reduce extra load/store, therefore reducing the probability of a
+ * pipeline stall caused by a full load/store buffer, so that the
+ * actual memcpy instructions can be issued and the H/W prefetcher
+ * can go to work as early as possible.
+ */
+static inline void __attribute__((always_inline))
+vhost_memcpy(void *dst, const void *src, size_t n)
+{
+   /* Copy size <= 16 bytes */
+   if (n < 16) {
+   if (n & 0x01) {
+   *(uint8_t *)dst = *(const uint8_t *)src;
+   src = (const uint8_t *)src + 1;
+   dst = (uint8_t *)dst + 1;
+   }
+   if (n & 0x02) {
+   *(uint16_t *)dst = *(const uint16_t *)src;
+   src = (const uint16_t *)src + 1;
+   dst = (uint16_t *)dst + 1;
+   }
+   if (n & 0x04) {
+   *(uint32_t *)dst = *(const uint32_t *)src;
+   src = (const uint32_t *)src + 1;
+   dst = (uint32_t *)dst + 1;
+   }
+   if (n & 0x08)
+   *(uint64_t *)dst = *(const uint64_t *)src;
+
+   return;
+   }
+
+   /* Copy 16 <= size <= 32 bytes */
+   if (n <= 32) {
+   rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+   rte_mov16((uint8_t *)dst - 16 + n,
+   (const uint8_t *)src - 16 + n);
+
+   return;
+   }
+
+   /* Copy 32 < size <= 64 bytes */
+   if (n <= 64) {
+   rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+   rte_mov32((uint8_t *)dst - 32 + n,
+   (const uint8_t *)src - 32 + n);
+
+   return;
+   }
+
+   /* Copy 64 bytes blocks */
+   for (; n >= 64; n -= 64) {
+   rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+   dst = (uint8_t *)dst + 64;
+   src = (const uint8_t *)src + 64;
+   }
+
+   /* Copy whatever left */
+   rte_mov64((uint8_t *)dst - 64 + n,
+   (const uint8_t *)src - 64 + n);
+}
+
 static inline void __attribute__((always_inline))
 vhost_log_page(uint8_t *log_base, uint64_t page)
 {
@@ -246,7 +312,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc 
*descs,
}
 
cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-   rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+   vhost_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
cpy_len);
vhos

[dpdk-dev] [PATCH v5 6/6] vhost: optimize cache access

2016-09-09 Thread Zhihong Wang
This patch reorders the code to delay virtio header write to optimize cache
access efficiency for cases where the mrg_rxbuf feature is turned on. It
reduces CPU pipeline stall cycles significantly.

Signed-off-by: Zhihong Wang 
---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 lib/librte_vhost/virtio_net.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e9f6353..0086bcb 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -197,6 +197,7 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint32_t mbuf_len;
uint32_t mbuf_avail;
uint32_t cpy_len;
+   uint32_t copy_virtio_hdr;
uint32_t num_buffers = 0;

/* start with the first mbuf of the packet */
@@ -211,12 +212,12 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
if (unlikely(!desc_addr))
goto error;

-   /* handle virtio header */
+   /*
+* handle virtio header, the actual write operation is delayed
+* for cache optimization, to reduce CPU pipeline stall cycles.
+*/
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
-   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
-   if (is_mrg_rxbuf)
-   virtio_hdr->num_buffers = 1;
-
+   copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
desc_offset = dev->vhost_hlen;
@@ -266,8 +267,15 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
goto error;
}

-   /* copy mbuf data */
+   /* copy virtio header and mbuf data */
cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+   if (copy_virtio_hdr) {
+   copy_virtio_hdr = 0;
+   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+   if (is_mrg_rxbuf)
+   virtio_hdr->num_buffers = num_buffers + 1;
+   }
+
rte_memcpy((void *)(uintptr_t)desc_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_avail),
-- 
2.7.4



[dpdk-dev] [PATCH v5 5/6] vhost: batch update used ring

2016-09-09 Thread Zhihong Wang
This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang 
---
Changes in v4:

 1. Free shadow used ring in the right place.

 2. Add failure check for shadow used ring malloc.

 lib/librte_vhost/vhost.c  | 20 --
 lib/librte_vhost/vhost.h  |  4 +++
 lib/librte_vhost/vhost_user.c | 31 +
 lib/librte_vhost/virtio_net.c | 64 +++
 4 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+   struct vhost_virtqueue *vq_0;
+   struct vhost_virtqueue *vq_1;
uint32_t i;

-   for (i = 0; i < dev->virt_qp_nb; i++)
-   rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+   for (i = 0; i < dev->virt_qp_nb; i++) {
+   vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+   if (vq_0->shadow_used_ring) {
+   rte_free(vq_0->shadow_used_ring);
+   vq_0->shadow_used_ring = NULL;
+   }
+
+   vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+   if (vq_1->shadow_used_ring) {
+   rte_free(vq_1->shadow_used_ring);
+   vq_1->shadow_used_ring = NULL;
+   }
+
+   /* malloc together, free together */
+   rte_free(vq_0);
+   }

rte_free(dev);
 }
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_tlog_guest_addr;
+
+   /* Shadow used ring for performance */
+   struct vring_used_elem  *shadow_used_ring;
+   uint32_tshadow_used_idx;
 } __rte_cache_aligned;

 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
 vhost_user_set_vring_num(struct virtio_net *dev,
 struct vhost_vring_state *state)
 {
-   dev->virtqueue[state->index]->size = state->num;
+   struct vhost_virtqueue *vq;
+
+   vq = dev->virtqueue[state->index];
+   vq->size = state->num;
+   if (!vq->shadow_used_ring) {
+   vq->shadow_used_ring = rte_malloc(NULL,
+   vq->size * sizeof(struct vring_used_elem),
+   RTE_CACHE_LINE_SIZE);
+   if (!vq->shadow_used_ring) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "Failed to allocate memory"
+   " for shadow used ring.\n");
+   return -1;
+   }
+   }

return 0;
 }
@@ -611,14 +625,21 @@ static int
 vhost_user_get_vring_base(struct virtio_net *dev,
  struct vhost_vring_state *state)
 {
+   struct vhost_virtqueue *vq;
+
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev->vid);
}

+   vq = dev->virtqueue[state->index];
/* Here we are safe to get the last used index */
-   state->num = dev->virtqueue[state->index]->last_used_idx;
+   state->num = vq->last_used_idx;
+   if (vq->shadow_used_ring) {
+   rte_free(vq->shadow_used_ring);
+   vq->shadow_used_ring = NULL;
+   }

RTE_LOG(INFO, VHOST_CONFIG,
"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 * sent and only sent in vhost_vring_stop.
 * TODO: cleanup the vring, it isn't usable since here.
 */
-   if (dev->virtqueue[state->index]->kickfd >= 0)
-   close(dev->virtqueue[state->index]->kickfd);
+   if (vq->kickfd >= 0)
+   close(vq->kickfd);

-   dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+   vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

return 0;
 }
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index b38f18f..e9f6353 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -134,17 +134,52 

[dpdk-dev] [PATCH v5 4/6] vhost: add desc prefetch

2016-09-09 Thread Zhihong Wang
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/virtio_net.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 6f63968..b38f18f 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -302,6 +302,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
/* start enqueuing packets 1 by 1 */
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
+   /* prefetch the next desc */
+   if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+   rte_prefetch0(&vq->desc[vq->avail->ring[
+   (vq->last_used_idx + 1) &
+   (vq->size - 1)]]);
+
if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
is_mrg_rxbuf))
break;
-- 
2.7.4



[dpdk-dev] [PATCH v5 3/6] vhost: remove useless volatile

2016-09-09 Thread Zhihong Wang
This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index c2dfc3c..9707dfc 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_tsize;

/* Last index used on the available ring */
-   volatile uint16_t   last_used_idx;
+   uint16_tlast_used_idx;
 #define VIRTIO_INVALID_EVENTFD (-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD   (-2)

-- 
2.7.4



[dpdk-dev] [PATCH v5 2/6] vhost: rewrite enqueue

2016-09-09 Thread Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.

Signed-off-by: Zhihong Wang 
---
Changes in v5:

 1. Rebase to the latest branch.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

---
Changes in v4:

 1. Refactor the code for clearer logic.

 2. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete in the same patch.

 lib/librte_vhost/virtio_net.c | 514 --
 1 file changed, 138 insertions(+), 376 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0d6e7d9..6f63968 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t 
qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }

-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
cksum));
break;
}
+   } else {
+   net_hdr->flags   = 0;
+   net_hdr->csum_start  = 0;
+   net_hdr->csum_offset = 0;
}

if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,439 +126,197 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+   } else {
+   net_hdr->gso_type = 0;
+   net_hdr->hdr_len  = 0;
+   net_hdr->gso_size = 0;
}
 }

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-   struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint32_t desc_chain_head, uint32_t desc_chain_len)
 {
-   if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-   *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-   else
-   *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+   uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
+   vq->last_used_idx++;
+   vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+   ring[used_idx]),
+   sizeof(vq->used->ring[used_idx]));
 }

 static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint16_t avail_idx, struct rte_mbuf *mbuf,
+   uint32_t is_mrg_rxbuf)
 {
-   uint32_t desc_avail, desc_offset;
-   uint32_t mbuf_avail, mbuf_offset;
-   uint32_t cpy_len;
+   struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
-   struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+   uint32_t desc_chain_head;
+   uint32_t desc_chain_len;
+   uint32_t desc_current;
+   uint32_t desc_offset;
+   uint32_t mbuf_len;
+   uint32_t mbuf_avail;
+   uint32_t cpy_len;
+   uint32_t num_buffers = 0;

-   desc = &vq->desc[desc_idx];
+   /* start with the first mbuf of the packet */
+   mbuf_len = rte_pktmbuf_data_len(mbuf);
+   mbuf_avail = mbuf_len;
+
+   /* get the current desc */
+   desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+   desc_chain_head = desc_current;
+   desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
-   /*
-* Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-* performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-* otherwise stores offset on the stack instead of in a register.
-*/
-   if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-   return -1;
+   if (unlikely(!desc_addr))
+   goto error;

-   rte_prefetch0((void *)(

[dpdk-dev] [PATCH v5 1/6] vhost: fix windows vm hang

2016-09-09 Thread Zhihong Wang
This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost code
which causes the guest to hang once any packets are enqueued when mrg_rxbuf
is turned on by setting the right id and len in the used ring.

As defined in virtio spec 0.95 and 1.0, in each used ring element, id means
index of start of used descriptor chain, and len means total length of the
descriptor chain which was written to. While in 16.07 code, index of the
last descriptor is assigned to id, and the length of the last descriptor is
assigned to len.

How to test?

 1. Start testpmd in the host with a vhost port.

 2. Start a Windows VM image with qemu and connect to the vhost port.

 3. Start io forwarding with tx_first in host testpmd.

For 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: 
Signed-off-by: Zhihong Wang 
---
Changes in v5:

 1. Add details in commit log.

 lib/librte_vhost/virtio_net.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..0d6e7d9 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+   uint32_t desc_chain_head;
+   uint32_t desc_chain_len;
uint32_t mbuf_offset, mbuf_avail;
uint32_t desc_offset, desc_avail;
uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,

desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
desc_offset = dev->vhost_hlen;
+   desc_chain_head = buf_vec[vec_idx].desc_idx;
+   desc_chain_len = desc_offset;

mbuf_avail  = rte_pktmbuf_data_len(m);
mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
/* done with current desc buf, get the next one */
if (desc_avail == 0) {
desc_idx = buf_vec[vec_idx].desc_idx;
+   vec_idx++;

if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
/* Update used ring with desc information */
used_idx = cur_idx++ & (vq->size - 1);
-   vq->used->ring[used_idx].id  = desc_idx;
-   vq->used->ring[used_idx].len = desc_offset;
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used,
 ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
+   desc_chain_head = buf_vec[vec_idx].desc_idx;
+   desc_chain_len = 0;
}

-   vec_idx++;
desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
if (unlikely(!desc_addr))
return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
mbuf_offset += cpy_len;
desc_avail  -= cpy_len;
desc_offset += cpy_len;
+   desc_chain_len += cpy_len;
}

used_idx = cur_idx & (vq->size - 1);
-   vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-   vq->used->ring[used_idx].len = desc_offset;
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
-- 
2.7.4



[dpdk-dev] [PATCH v5 0/6] vhost: optimize enqueue

2016-09-09 Thread Zhihong Wang
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.

 *  Better scalability can be achieved that each vhost core can support
more connections because it takes less cycles to handle each single
frontend.

This patch set contains:

 1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.

 2. A baseline patch to rewrite the vhost logic.

 3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.

In the existing code there're 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature turned on. Besides, having 2 callback paths increases
maintenance efforts.

Also, there's a compatibility issue in the existing code which causes
Windows VM to hang when the mrg_rxbuf feature turned on.

---
Changes in v5:

 1. Rebase to the latest branch.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

 4. Add details in commit log.

---
Changes in v4:

 1. Fix a Windows VM compatibility issue.

 2. Free shadow used ring in the right place.

 3. Add failure check for shadow used ring malloc.

 4. Refactor the code for clearer logic.

 5. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

 1. Split the big function into several small ones.

 2. Use multiple patches to explain each optimization.

 3. Add comments.

Zhihong Wang (6):
  vhost: fix windows vm hang
  vhost: rewrite enqueue
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost.c  |  20 +-
 lib/librte_vhost/vhost.h  |   6 +-
 lib/librte_vhost/vhost_user.c |  31 ++-
 lib/librte_vhost/virtio_net.c | 561 +++---
 4 files changed, 242 insertions(+), 376 deletions(-)

-- 
2.7.4



[dpdk-dev] [PATCH v4 6/6] vhost: optimize cache access

2016-08-30 Thread Zhihong Wang
This patch reorders the code to delay virtio header write to optimize cache
access efficiency for cases where the mrg_rxbuf feature is turned on. It
reduces CPU pipeline stall cycles significantly.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index ddc7b21..fc5dc4a 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -196,6 +196,7 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint32_t mbuf_len;
uint32_t mbuf_avail;
uint32_t copy_len;
+   uint32_t copy_virtio_hdr;
uint32_t extra_buffers = 0;

/* start with the first mbuf of the packet */
@@ -210,12 +211,12 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
if (unlikely(!desc_addr))
goto error;

-   /* handle virtio header */
+   /*
+* handle virtio header, the actual write operation is delayed
+* for cache optimization, to reduce CPU pipeline stall cycles.
+*/
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
-   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
-   if (is_mrg_rxbuf)
-   virtio_hdr->num_buffers = extra_buffers + 1;
-
+   copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
desc_offset = dev->vhost_hlen;
@@ -266,8 +267,15 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
goto error;
}

-   /* copy mbuf data */
+   /* copy virtio header and mbuf data */
copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+   if (copy_virtio_hdr) {
+   copy_virtio_hdr = 0;
+   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+   if (is_mrg_rxbuf)
+   virtio_hdr->num_buffers = extra_buffers + 1;
+   }
+
rte_memcpy((void *)(uintptr_t)desc_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_avail),
-- 
2.7.4



[dpdk-dev] [PATCH v4 5/6] vhost: batch update used ring

2016-08-30 Thread Zhihong Wang
This patch enables batch update of the used ring for better efficiency.

---
Changes in v4:

 1. Free shadow used ring in the right place.

 2. Add failure check for shadow used ring malloc.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost-net.h  |  4 +++
 lib/librte_vhost/vhost_rxtx.c | 62 ---
 lib/librte_vhost/virtio-net.c | 42 ++---
 3 files changed, 95 insertions(+), 13 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 51fdf3d..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_tlog_guest_addr;
+
+   /* Shadow used ring for performance */
+   struct vring_used_elem  *shadow_used_ring;
+   uint32_tshadow_used_idx;
 } __rte_cache_aligned;

 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 927896c..ddc7b21 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -134,16 +134,51 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
 }

 static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
-   uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+   uint32_t desc_chain_len)
 {
-   uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);
+   vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+   vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+   vq->shadow_used_idx++;
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint32_t used_idx_start)
+{
+   if (used_idx_start + vq->shadow_used_idx < vq->size) {
+   rte_memcpy(&vq->used->ring[used_idx_start],
+   &vq->shadow_used_ring[0],
+   vq->shadow_used_idx *
+   sizeof(struct vring_used_elem));
+   vhost_log_used_vring(dev, vq,
+   offsetof(struct vring_used,
+   ring[used_idx_start]),
+   vq->shadow_used_idx *
+   sizeof(struct vring_used_elem));
+   } else {
+   uint32_t part_1 = vq->size - used_idx_start;
+   uint32_t part_2 = vq->shadow_used_idx - part_1;

-   vq->used->ring[used_idx_round].id = desc_chain_head;
-   vq->used->ring[used_idx_round].len = desc_chain_len;
-   vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
-   ring[used_idx_round]),
-   sizeof(vq->used->ring[used_idx_round]));
+   rte_memcpy(&vq->used->ring[used_idx_start],
+   &vq->shadow_used_ring[0],
+   part_1 *
+   sizeof(struct vring_used_elem));
+   vhost_log_used_vring(dev, vq,
+   offsetof(struct vring_used,
+   ring[used_idx_start]),
+   part_1 *
+   sizeof(struct vring_used_elem));
+   rte_memcpy(&vq->used->ring[0],
+   &vq->shadow_used_ring[part_1],
+   part_2 *
+   sizeof(struct vring_used_elem));
+   vhost_log_used_vring(dev, vq,
+   offsetof(struct vring_used,
+   ring[0]),
+   part_2 *
+   sizeof(struct vring_used_elem));
+   }
 }

 static inline uint32_t __attribute__((always_inline))
@@ -208,7 +243,7 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
goto error;
} else if (is_mrg_rxbuf) {
/* start with the next desc chain */
-   update_used_ring(dev, vq, desc_chain_head,
+   update_used_ring(vq, desc_chain_head,
desc_chain_len);
vq->last_used_idx++;
extra_buffers++;
@@ -245,7 +280,7 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
desc_chain_len += copy_len;
}

-   update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+   update_used_ring(vq, desc_chain_head, desc_chain_len);

[dpdk-dev] [PATCH v4 4/6] vhost: add desc prefetch

2016-08-30 Thread Zhihong Wang
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 629e8ae..927896c 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -304,6 +304,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
/* start enqueuing packets 1 by 1 */
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
+   /* prefetch the next desc */
+   if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+   rte_prefetch0(&vq->desc[vq->avail->ring[
+   (vq->last_used_idx + 1) &
+   (vq->size - 1)]]);
+
if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
is_mrg_rxbuf))
break;
-- 
2.7.4



[dpdk-dev] [PATCH v4 3/6] vhost: remove useless volatile

2016-08-30 Thread Zhihong Wang
This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost-net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..51fdf3d 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_tsize;

/* Last index used on the available ring */
-   volatile uint16_t   last_used_idx;
+   uint16_tlast_used_idx;
 #define VIRTIO_INVALID_EVENTFD (-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD   (-2)

-- 
2.7.4



[dpdk-dev] [PATCH v4 2/6] vhost: rewrite enqueue

2016-08-30 Thread Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.

---
Changes in v4:

 1. Refactor the code for clearer logic.

 2. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete in the same patch.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 525 --
 1 file changed, 145 insertions(+), 380 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 5806f99..629e8ae 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t 
qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }

-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
cksum));
break;
}
+   } else {
+   net_hdr->flags = 0;
+   net_hdr->csum_start = 0;
+   net_hdr->csum_offset = 0;
}

if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,437 +126,198 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+   } else {
+   net_hdr->gso_type = 0;
+   net_hdr->hdr_len = 0;
+   net_hdr->gso_size = 0;
}
 }

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-   struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint32_t desc_chain_head, uint32_t desc_chain_len)
 {
-   if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-   *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-   else
-   *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+   uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);
+
+   vq->used->ring[used_idx_round].id = desc_chain_head;
+   vq->used->ring[used_idx_round].len = desc_chain_len;
+   vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+   ring[used_idx_round]),
+   sizeof(vq->used->ring[used_idx_round]));
 }

-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint16_t avail_idx, struct rte_mbuf *mbuf,
+   uint32_t is_mrg_rxbuf)
 {
-   uint32_t desc_avail, desc_offset;
-   uint32_t mbuf_avail, mbuf_offset;
-   uint32_t cpy_len;
+   struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
-   struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-   desc = &vq->desc[desc_idx];
+   uint32_t desc_chain_head;
+   uint32_t desc_chain_len;
+   uint32_t desc_current;
+   uint32_t desc_offset;
+   uint32_t mbuf_len;
+   uint32_t mbuf_avail;
+   uint32_t copy_len;
+   uint32_t extra_buffers = 0;
+
+   /* start with the first mbuf of the packet */
+   mbuf_len = rte_pktmbuf_data_len(mbuf);
+   mbuf_avail = mbuf_len;
+
+   /* get the current desc */
+   desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+   desc_chain_head = desc_current;
+   desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
-   /*
-* Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-* performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-* otherwise stores offset on the stack instead of in a register.
-*/
-   if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-   return -1;
+   if (unlikely(!desc_addr))
+   goto error;

-   rte_prefetch0((void *)(uintptr_t)desc_addr);
+   /* handle virtio header */
+   virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+   v

[dpdk-dev] [PATCH v4 1/6] vhost: fix windows vm hang

2016-08-30 Thread Zhihong Wang
This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost code,
which causes the guest to hang once any packets are enqueued when mrg_rxbuf
is turned on.

How to test?

 1. Start testpmd in the host with a vhost port.

 2. Start a Windows VM image with qemu and connect to the vhost port.

 3. Start io forwarding with tx_first in host testpmd.

For 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: 
Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..5806f99 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+   uint32_t desc_chain_head;
+   uint32_t desc_chain_len;
uint32_t mbuf_offset, mbuf_avail;
uint32_t desc_offset, desc_avail;
uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,

desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
desc_offset = dev->vhost_hlen;
+   desc_chain_head = buf_vec[vec_idx].desc_idx;
+   desc_chain_len = desc_offset;

mbuf_avail  = rte_pktmbuf_data_len(m);
mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
/* done with current desc buf, get the next one */
if (desc_avail == 0) {
desc_idx = buf_vec[vec_idx].desc_idx;
+   vec_idx++;

if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
/* Update used ring with desc information */
used_idx = cur_idx++ & (vq->size - 1);
-   vq->used->ring[used_idx].id  = desc_idx;
-   vq->used->ring[used_idx].len = desc_offset;
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used,
 ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
+   desc_chain_head = buf_vec[vec_idx].desc_idx;
+   desc_chain_len = 0;
}

-   vec_idx++;
desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
if (unlikely(!desc_addr))
return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
mbuf_offset += cpy_len;
desc_avail  -= cpy_len;
desc_offset += cpy_len;
+   desc_chain_len += cpy_len;
}

used_idx = cur_idx & (vq->size - 1);
-   vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-   vq->used->ring[used_idx].len = desc_offset;
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
-- 
2.7.4



[dpdk-dev] [PATCH v4 0/6] vhost: optimize enqueue

2016-08-30 Thread Zhihong Wang
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.

 *  Better scalability can be achieved that each vhost core can support
more connections because it takes less cycles to handle each single
frontend.

This patch set contains:

 1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.

 2. A baseline patch to rewrite the vhost logic.

 3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.

In the existing code there're 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
the maintenance effort.

Also, there's a compatibility issue in the existing code which causes the
Windows VM to hang when the mrg_rxbuf feature is turned on.

---
Changes in v4:

 1. Fix a Windows VM compatibility issue.

 2. Free shadow used ring in the right place.

 3. Add failure check for shadow used ring malloc.

 4. Refactor the code for clearer logic.

 5. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

 1. Split the big function into several small ones.

 2. Use multiple patches to explain each optimization.

 3. Add comments.

Zhihong Wang (6):
  vhost: fix windows vm hang
  vhost: rewrite enqueue
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost-net.h  |   6 +-
 lib/librte_vhost/vhost_rxtx.c | 572 +++---
 lib/librte_vhost/virtio-net.c |  42 +++-
 3 files changed, 244 insertions(+), 376 deletions(-)

-- 
2.7.4



[dpdk-dev] [PATCH v3 5/5] vhost: optimize cache access

2016-08-19 Thread Zhihong Wang
This patch reorders the code to delay virtio header write to optimize cache
access efficiency for cases where the mrg_rxbuf feature is turned on. It
reduces CPU pipeline stall cycles significantly.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index c4abaf1..e3ba4e0 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -154,6 +154,7 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint32_t mbuf_len = 0;
uint32_t mbuf_avail = 0;
uint32_t copy_len = 0;
+   uint32_t copy_virtio_hdr = 0;
uint32_t extra_buffers = 0;

/* start with the first mbuf of the packet */
@@ -168,15 +169,16 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
if (unlikely(!desc_addr))
goto error;

-   /* handle virtio header */
+   /*
+* handle virtio header, the actual write operation
+* is delayed for cache optimization.
+*/
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
-   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+   copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
desc_offset = dev->vhost_hlen;
desc_chain_len = desc_offset;
desc_addr += desc_offset;
-   if (is_mrg_rxbuf)
-   virtio_hdr->num_buffers = 1;

/* start copy from mbuf to desc */
while (1) {
@@ -228,8 +230,15 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
goto rollback;
}

-   /* copy mbuf data */
+   /* copy virtio header and mbuf data */
copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+   if (copy_virtio_hdr) {
+   copy_virtio_hdr = 0;
+   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+   if (is_mrg_rxbuf)
+   virtio_hdr->num_buffers = extra_buffers + 1;
+   }
+
rte_memcpy((void *)(uintptr_t)desc_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_avail),
-- 
2.7.4



[dpdk-dev] [PATCH v3 4/5] vhost: batch update used ring

2016-08-19 Thread Zhihong Wang
This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost-net.h  |  4 +++
 lib/librte_vhost/vhost_rxtx.c | 68 +--
 lib/librte_vhost/virtio-net.c | 15 --
 3 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 51fdf3d..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_tlog_guest_addr;
+
+   /* Shadow used ring for performance */
+   struct vring_used_elem  *shadow_used_ring;
+   uint32_tshadow_used_idx;
 } __rte_cache_aligned;

 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 7523b2d..c4abaf1 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -155,7 +155,6 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint32_t mbuf_avail = 0;
uint32_t copy_len = 0;
uint32_t extra_buffers = 0;
-   uint32_t used_idx_round = 0;

/* start with the first mbuf of the packet */
mbuf_len = rte_pktmbuf_data_len(mbuf);
@@ -203,17 +202,11 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
goto rollback;
} else if (is_mrg_rxbuf) {
/* start with the next desc chain */
-   used_idx_round = vq->last_used_idx
-   & (vq->size - 1);
-   vq->used->ring[used_idx_round].id =
+   vq->shadow_used_ring[vq->shadow_used_idx].id =
desc_chain_head;
-   vq->used->ring[used_idx_round].len =
+   vq->shadow_used_ring[vq->shadow_used_idx].len =
desc_chain_len;
-   vhost_log_used_vring(dev, vq,
-   offsetof(struct vring_used,
-   ring[used_idx_round]),
-   sizeof(vq->used->ring[
-   used_idx_round]));
+   vq->shadow_used_idx++;
vq->last_used_idx++;
extra_buffers++;
virtio_hdr->num_buffers++;
@@ -248,12 +241,9 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
desc_chain_len += copy_len;
}

-   used_idx_round = vq->last_used_idx & (vq->size - 1);
-   vq->used->ring[used_idx_round].id = desc_chain_head;
-   vq->used->ring[used_idx_round].len = desc_chain_len;
-   vhost_log_used_vring(dev, vq,
-   offsetof(struct vring_used, ring[used_idx_round]),
-   sizeof(vq->used->ring[used_idx_round]));
+   vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+   vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+   vq->shadow_used_idx++;
vq->last_used_idx++;

return 0;
@@ -268,6 +258,45 @@ error:
 }

 static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint32_t used_idx_start)
+{
+   if (used_idx_start + vq->shadow_used_idx < vq->size) {
+   rte_memcpy(&vq->used->ring[used_idx_start],
+   &vq->shadow_used_ring[0],
+   vq->shadow_used_idx *
+   sizeof(struct vring_used_elem));
+   vhost_log_used_vring(dev, vq,
+   offsetof(struct vring_used,
+   ring[used_idx_start]),
+   vq->shadow_used_idx *
+   sizeof(struct vring_used_elem));
+   } else {
+   uint32_t part_1 = vq->size - used_idx_start;
+   uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+   rte_memcpy(&vq->used->ring[used_idx_start],
+   &vq->shadow_used_ring[0],
+   part_1 *
+   sizeof(struct vring_used_elem));
+   vhost_log_used_vring(dev, vq,
+   offsetof(struct vring_used,
+   ring[used_idx_start]),
+   part_1 *
+   sizeof(struct vring_used_elem));
+ 

[dpdk-dev] [PATCH v3 3/5] vhost: add desc prefetch

2016-08-19 Thread Zhihong Wang
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index b09a9c3..7523b2d 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -131,6 +131,11 @@ loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, 
uint32_t pkt_left)
if (pkt_left == 0 || avail_idx == vq->last_used_idx)
return 1;

+   /* prefetch the next desc */
+   if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+   rte_prefetch0(&vq->desc[vq->avail->ring[
+   (vq->last_used_idx + 1) & (vq->size - 1)]]);
+
return 0;
 }

-- 
2.7.4



[dpdk-dev] [PATCH v3 2/5] vhost: remove useless volatile

2016-08-19 Thread Zhihong Wang
This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost-net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..51fdf3d 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_tsize;

/* Last index used on the available ring */
-   volatile uint16_t   last_used_idx;
+   uint16_tlast_used_idx;
 #define VIRTIO_INVALID_EVENTFD (-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD   (-2)

-- 
2.7.4



[dpdk-dev] [PATCH v3 1/5] vhost: rewrite enqueue

2016-08-19 Thread Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete in the same patch.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 537 +-
 1 file changed, 160 insertions(+), 377 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..b09a9c3 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t 
qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }

-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -125,427 +125,210 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
}
 }

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-   struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
 {
-   if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-   *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-   else
-   *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+   if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+   return 1;
+
+   return 0;
 }

-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint16_t avail_idx, struct rte_mbuf *mbuf,
+   uint32_t is_mrg_rxbuf)
 {
-   uint32_t desc_avail, desc_offset;
-   uint32_t mbuf_avail, mbuf_offset;
-   uint32_t cpy_len;
+   struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
-   uint64_t desc_addr;
-   struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-   desc = &vq->desc[desc_idx];
+   uint64_t desc_addr = 0;
+   uint32_t desc_chain_head = 0;
+   uint32_t desc_chain_len = 0;
+   uint32_t desc_current = 0;
+   uint32_t desc_offset = 0;
+   uint32_t mbuf_len = 0;
+   uint32_t mbuf_avail = 0;
+   uint32_t copy_len = 0;
+   uint32_t extra_buffers = 0;
+   uint32_t used_idx_round = 0;
+
+   /* start with the first mbuf of the packet */
+   mbuf_len = rte_pktmbuf_data_len(mbuf);
+   mbuf_avail = mbuf_len;
+
+   /* get the current desc */
+   desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+   desc_chain_head = desc_current;
+   desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
-   /*
-* Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-* performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-* otherwise stores offset on the stack instead of in a register.
-*/
-   if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-   return -1;
-
-   rte_prefetch0((void *)(uintptr_t)desc_addr);
+   if (unlikely(!desc_addr))
+   goto error;

-   virtio_enqueue_offload(m, &virtio_hdr.hdr);
-   copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+   /* handle virtio header */
+   virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-   PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
desc_offset = dev->vhost_hlen;
-   desc_avail  = desc->len - dev->vhost_hlen;
+   desc_chain_len = desc_offset;
+   desc_addr += desc_offset;
+   if (is_mrg_rxbuf)
+   virtio_hdr->num_buffers = 1;

-   mbuf_avail  = rte_pktmbuf_data_len(m);
-   mbuf_offset = 0;
-   while (mbuf_avail != 0 || m->next != NULL) {
-   /* done with current mbuf, fetch next */
-   if (mbuf_avail == 0) {
-   m = m->next;
-
-   mbuf_offset = 0;
-   mbuf_avail  = rte_pktmbuf_data_len(m);
+   /* start copy from mbuf to desc */
+   while (1) {
+   /* get the next mbuf if the current done */
+   if (!mbuf_avail) {
+   if (mbuf->next) {
+   mbuf = mbuf->next;
+   mbuf_len = rte_pktmbuf

[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-08-19 Thread Zhihong Wang
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  For fast frontends (eg. DPDK virtio pmd), higher performance (maximum
throughput) can be achieved.

 *  For slow frontends (eg. kernel virtio-net), better scalability can be
achieved, each vhost core can support more connections since it takes
less cycles to handle each single frontend.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.

In the existing code there're 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Also, having 2 separate functions increases
the maintenance effort.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

 1. Split the big function into several small ones.

 2. Use multiple patches to explain each optimization.

 3. Add comments.

Zhihong Wang (5):
  vhost: rewrite enqueue
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost-net.h  |   6 +-
 lib/librte_vhost/vhost_rxtx.c | 573 +++---
 lib/librte_vhost/virtio-net.c |  15 +-
 3 files changed, 220 insertions(+), 374 deletions(-)

-- 
2.7.4



[dpdk-dev] [PATCH v2 6/6] vhost: optimize cache access

2016-08-18 Thread Zhihong Wang
This patch reorders the code to delay virtio header write to optimize cache
access efficiency for cases where the mrg_rxbuf feature is turned on. It
reduces CPU pipeline stall cycles significantly.


Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 60d63d3..15f7f9c 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -154,6 +154,7 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint32_t mbuf_len = 0;
uint32_t mbuf_len_left = 0;
uint32_t copy_len = 0;
+   uint32_t copy_virtio_hdr = 0;
uint32_t extra_buffers = 0;

/* start with the first mbuf of the packet */
@@ -168,18 +169,17 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
if (unlikely(!desc_host_write_addr))
goto error;

-   /* handle virtio header */
+   /*
+* handle virtio header, the actual write operation
+* is delayed for cache optimization.
+*/
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)
(uintptr_t)desc_host_write_addr;
-   memset((void *)(uintptr_t)&(virtio_hdr->hdr),
-   0, dev->vhost_hlen);
-   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+   copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
desc_write_offset = dev->vhost_hlen;
desc_chain_len = desc_write_offset;
desc_host_write_addr += desc_write_offset;
-   if (is_mrg_rxbuf)
-   virtio_hdr->num_buffers = 1;

/* start copy from mbuf to desc */
while (1) {
@@ -233,9 +233,18 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
goto rollback;
}

-   /* copy mbuf data */
+   /* copy virtio header and mbuf data */
copy_len = RTE_MIN(desc->len - desc_write_offset,
mbuf_len_left);
+   if (copy_virtio_hdr) {
+   copy_virtio_hdr = 0;
+   memset((void *)(uintptr_t)&(virtio_hdr->hdr),
+   0, dev->vhost_hlen);
+   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+   if (is_mrg_rxbuf)
+   virtio_hdr->num_buffers = extra_buffers + 1;
+   }
+
rte_memcpy((void *)(uintptr_t)desc_host_write_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_len_left),
-- 
2.7.4



[dpdk-dev] [PATCH v2 5/6] vhost: batch update used ring

2016-08-18 Thread Zhihong Wang
This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost-net.h  |  4 +++
 lib/librte_vhost/vhost_rxtx.c | 68 +--
 lib/librte_vhost/virtio-net.c | 15 --
 3 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 51fdf3d..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_tlog_guest_addr;
+
+   /* Shadow used ring for performance */
+   struct vring_used_elem  *shadow_used_ring;
+   uint32_tshadow_used_idx;
 } __rte_cache_aligned;

 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 7db83d0..60d63d3 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -155,7 +155,6 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint32_t mbuf_len_left = 0;
uint32_t copy_len = 0;
uint32_t extra_buffers = 0;
-   uint32_t used_idx_round = 0;

/* start with the first mbuf of the packet */
mbuf_len = rte_pktmbuf_data_len(mbuf);
@@ -207,17 +206,11 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
goto rollback;
} else if (is_mrg_rxbuf) {
/* start with the next desc chain */
-   used_idx_round = vq->last_used_idx
-   & (vq->size - 1);
-   vq->used->ring[used_idx_round].id =
+   vq->shadow_used_ring[vq->shadow_used_idx].id =
desc_chain_head;
-   vq->used->ring[used_idx_round].len =
+   vq->shadow_used_ring[vq->shadow_used_idx].len =
desc_chain_len;
-   vhost_log_used_vring(dev, vq,
-   offsetof(struct vring_used,
-   ring[used_idx_round]),
-   sizeof(vq->used->ring[
-   used_idx_round]));
+   vq->shadow_used_idx++;
vq->last_used_idx++;
extra_buffers++;
virtio_hdr->num_buffers++;
@@ -255,12 +248,9 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
desc_chain_len += copy_len;
}

-   used_idx_round = vq->last_used_idx & (vq->size - 1);
-   vq->used->ring[used_idx_round].id = desc_chain_head;
-   vq->used->ring[used_idx_round].len = desc_chain_len;
-   vhost_log_used_vring(dev, vq,
-   offsetof(struct vring_used, ring[used_idx_round]),
-   sizeof(vq->used->ring[used_idx_round]));
+   vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+   vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+   vq->shadow_used_idx++;
vq->last_used_idx++;

return 0;
@@ -275,6 +265,45 @@ error:
 }

 static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint32_t used_idx_start)
+{
+   if (used_idx_start + vq->shadow_used_idx < vq->size) {
+   rte_memcpy(&vq->used->ring[used_idx_start],
+   &vq->shadow_used_ring[0],
+   vq->shadow_used_idx *
+   sizeof(struct vring_used_elem));
+   vhost_log_used_vring(dev, vq,
+   offsetof(struct vring_used,
+   ring[used_idx_start]),
+   vq->shadow_used_idx *
+   sizeof(struct vring_used_elem));
+   } else {
+   uint32_t part_1 = vq->size - used_idx_start;
+   uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+   rte_memcpy(&vq->used->ring[used_idx_start],
+   &vq->shadow_used_ring[0],
+   part_1 *
+   sizeof(struct vring_used_elem));
+   vhost_log_used_vring(dev, vq,
+   offsetof(struct vring_used,
+   ring[used_idx_start]),
+   part_1 *
+   sizeof(struct vring_used_elem));
+ 

[dpdk-dev] [PATCH v2 4/6] vhost: add desc prefetch

2016-08-18 Thread Zhihong Wang
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 939957d..7db83d0 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -131,6 +131,11 @@ loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, 
uint32_t pkt_left)
if (pkt_left == 0 || avail_idx == vq->last_used_idx)
return 1;

+   /* prefetch the next desc */
+   if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+   rte_prefetch0(&vq->desc[vq->avail->ring[
+   (vq->last_used_idx + 1) & (vq->size - 1)]]);
+
return 0;
 }

-- 
2.7.4



[dpdk-dev] [PATCH v2 3/6] vhost: remove useless volatile

2016-08-18 Thread Zhihong Wang
This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost-net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..51fdf3d 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_tsize;

/* Last index used on the available ring */
-   volatile uint16_t   last_used_idx;
+   uint16_tlast_used_idx;
 #define VIRTIO_INVALID_EVENTFD (-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD   (-2)

-- 
2.7.4



[dpdk-dev] [PATCH v2 2/6] vhost: remove obsolete

2016-08-18 Thread Zhihong Wang
This patch removes obsolete functions.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 408 --
 1 file changed, 408 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 8e6d782..939957d 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -125,414 +125,6 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
}
 }

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-   struct virtio_net_hdr_mrg_rxbuf hdr)
-{
-   if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-   *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-   else
-   *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
-{
-   uint32_t desc_avail, desc_offset;
-   uint32_t mbuf_avail, mbuf_offset;
-   uint32_t cpy_len;
-   struct vring_desc *desc;
-   uint64_t desc_addr;
-   struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-   desc = &vq->desc[desc_idx];
-   desc_addr = gpa_to_vva(dev, desc->addr);
-   /*
-* Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-* performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-* otherwise stores offset on the stack instead of in a register.
-*/
-   if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-   return -1;
-
-   rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-   virtio_enqueue_offload(m, &virtio_hdr.hdr);
-   copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-   vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-   PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-   desc_offset = dev->vhost_hlen;
-   desc_avail  = desc->len - dev->vhost_hlen;
-
-   mbuf_avail  = rte_pktmbuf_data_len(m);
-   mbuf_offset = 0;
-   while (mbuf_avail != 0 || m->next != NULL) {
-   /* done with current mbuf, fetch next */
-   if (mbuf_avail == 0) {
-   m = m->next;
-
-   mbuf_offset = 0;
-   mbuf_avail  = rte_pktmbuf_data_len(m);
-   }
-
-   /* done with current desc buf, fetch next */
-   if (desc_avail == 0) {
-   if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-   /* Room in vring buffer is not enough */
-   return -1;
-   }
-   if (unlikely(desc->next >= vq->size))
-   return -1;
-
-   desc = &vq->desc[desc->next];
-   desc_addr = gpa_to_vva(dev, desc->addr);
-   if (unlikely(!desc_addr))
-   return -1;
-
-   desc_offset = 0;
-   desc_avail  = desc->len;
-   }
-
-   cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-   rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-   rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-   cpy_len);
-   vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-   PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-cpy_len, 0);
-
-   mbuf_avail  -= cpy_len;
-   mbuf_offset += cpy_len;
-   desc_avail  -= cpy_len;
-   desc_offset += cpy_len;
-   }
-
-   return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
-   struct vhost_virtqueue *vq;
-   uint16_t avail_idx, free_entries, start_idx;
-   uint16_t desc_indexes[MAX_PKT_BURST];
-   uint16_t used_idx;
-   uint32_t i;
-
-   LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-   if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-   RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-   dev->vid, __func__, queue_id);
-   

[dpdk-dev] [PATCH v2 1/6] vhost: rewrite enqueue

2016-08-18 Thread Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_rxtx.c | 212 --
 1 file changed, 205 insertions(+), 7 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..8e6d782 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t 
qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }

-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -533,19 +533,217 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t 
queue_id,
return pkt_idx;
 }

+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
+{
+   if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+   return 1;
+
+   return 0;
+}
+
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint16_t avail_idx, struct rte_mbuf *mbuf,
+   uint32_t is_mrg_rxbuf)
+{
+   struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
+   struct vring_desc *desc;
+   uint64_t desc_host_write_addr = 0;
+   uint32_t desc_chain_head = 0;
+   uint32_t desc_chain_len = 0;
+   uint32_t desc_current = 0;
+   uint32_t desc_write_offset = 0;
+   uint32_t mbuf_len = 0;
+   uint32_t mbuf_len_left = 0;
+   uint32_t copy_len = 0;
+   uint32_t extra_buffers = 0;
+   uint32_t used_idx_round = 0;
+
+   /* start with the first mbuf of the packet */
+   mbuf_len = rte_pktmbuf_data_len(mbuf);
+   mbuf_len_left = mbuf_len;
+
+   /* get the current desc */
+   desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+   desc_chain_head = desc_current;
+   desc = &vq->desc[desc_current];
+   desc_host_write_addr = gpa_to_vva(dev, desc->addr);
+   if (unlikely(!desc_host_write_addr))
+   goto error;
+
+   /* handle virtio header */
+   virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)
+   (uintptr_t)desc_host_write_addr;
+   memset((void *)(uintptr_t)&(virtio_hdr->hdr),
+   0, dev->vhost_hlen);
+   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+   vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+   desc_write_offset = dev->vhost_hlen;
+   desc_chain_len = desc_write_offset;
+   desc_host_write_addr += desc_write_offset;
+   if (is_mrg_rxbuf)
+   virtio_hdr->num_buffers = 1;
+
+   /* start copy from mbuf to desc */
+   while (1) {
+   /* get the next mbuf if the current done */
+   if (!mbuf_len_left) {
+   if (mbuf->next) {
+   mbuf = mbuf->next;
+   mbuf_len = rte_pktmbuf_data_len(mbuf);
+   mbuf_len_left = mbuf_len;
+   } else
+   break;
+   }
+
+   /* get the next desc if the current done */
+   if (desc->len <= desc_write_offset) {
+   if (desc->flags & VRING_DESC_F_NEXT) {
+   /* go on with the current desc chain */
+   desc_write_offset = 0;
+   desc_current = desc->next;
+   desc = &vq->desc[desc_current];
+   desc_host_write_addr =
+   gpa_to_vva(dev, desc->addr);
+   if (unlikely(!desc_host_write_addr))
+   goto rollback;
+   } else if (is_mrg_rxbuf) {
+   /* start with the next desc chain */
+   used_idx_round = vq->last_used_idx
+   & (vq->size - 1);
+   vq->used->ring[used_idx_round].id =
+   desc_chain_head;
+   vq->used->ring[used_idx_round].len =
+   desc_chain_len;
+   vhost_log_used_vring(dev, vq,
+   offsetof(struct vring_used,
+   ring[used_idx_round]),
+   sizeof(vq->used->ring[
+

[dpdk-dev] [PATCH v2 0/6] vhost: optimize enqueue

2016-08-18 Thread Zhihong Wang
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  For fast frontends (eg. DPDK virtio pmd), higher performance (maximum
throughput) can be achieved.

 *  For slow frontends (eg. kernel virtio-net), better scalability can be
achieved, each vhost core can support more connections since it takes
less cycles to handle each single frontend.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

In the existing code there're 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature turned on. Also, having 2 separated functions increases
maintenance efforts.

---
Changes in v2:

 1. Split the big function into several small ones

 2. Use multiple patches to explain each optimization

 3. Add comments

Zhihong Wang (6):
  vhost: rewrite enqueue
  vhost: remove obsolete
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost-net.h  |   6 +-
 lib/librte_vhost/vhost_rxtx.c | 582 +++---
 lib/librte_vhost/virtio-net.c |  15 +-
 3 files changed, 228 insertions(+), 375 deletions(-)

-- 
2.7.4



[dpdk-dev] [PATCH] optimize vhost enqueue

2016-08-16 Thread Zhihong Wang
This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.

Currently there're 2 callbacks for vhost enqueue:
 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The virtio_dev_merge_rx doesn't provide optimal performance; it is also
reported to have a compatibility issue when working with Windows VMs.

Besides, having 2 separated functions increases maintenance efforts.

This patch uses a single function logic to replace the current 2 for
better maintainability, and provides better performance by optimizing
caching behavior especially for mrg_rxbuf turned on cases.

It also fixes the issue working with Windows VMs.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost-net.h  |   6 +-
 lib/librte_vhost/vhost_rxtx.c | 582 ++
 lib/librte_vhost/virtio-net.c |  15 +-
 3 files changed, 208 insertions(+), 395 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_tsize;

/* Last index used on the available ring */
-   volatile uint16_t   last_used_idx;
+   uint16_tlast_used_idx;
 #define VIRTIO_INVALID_EVENTFD (-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD   (-2)

@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_tlog_guest_addr;
+
+   /* Shadow used ring for performance */
+   struct vring_used_elem  *shadow_used_ring;
+   uint32_tshadow_used_idx;
 } __rte_cache_aligned;

 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..1263168 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t 
qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }

-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -125,427 +125,227 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
}
 }

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-   struct virtio_net_hdr_mrg_rxbuf hdr)
-{
-   if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-   *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-   else
-   *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+   struct rte_mbuf **pkts, uint16_t count)
 {
-   uint32_t desc_avail, desc_offset;
-   uint32_t mbuf_avail, mbuf_offset;
-   uint32_t cpy_len;
+   struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
+   struct vhost_virtqueue *vq;
struct vring_desc *desc;
-   uint64_t desc_addr;
-   struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-   desc = &vq->desc[desc_idx];
-   desc_addr = gpa_to_vva(dev, desc->addr);
-   /*
-* Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-* performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-* otherwise stores offset on the stack instead of in a register.
-*/
-   if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-   return -1;
-
-   rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-   virtio_enqueue_offload(m, &virtio_hdr.hdr);
-   copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-   vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-   PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-   desc_offset = dev->vhost_hlen;
-   desc_avail  = desc->len - dev->vhost_hlen;
-
-   mbuf_avail  = rte_pktmbuf_data_len(m);
-   mbuf_offset = 0;
-   while (mbuf_avail != 0 || m->next != NULL) {
-   /* done with current mbuf, fetch next */
-   if (mbuf_avail == 0) {
-   m = m->next;
-
-   mbuf_offset = 0;
-   mbuf_avail  = rte_pktmbuf_data_len(m);
-   }
-
-   /* done with current desc buf, fetch next */
-   if (desc_avail == 0) {
-   if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-   /* Room in 

[dpdk-dev] [PATCH v3] doc: virtio PMD Rx/Tx callbacks

2016-07-01 Thread Zhihong Wang
This patch explains current virtio PMD Rx/Tx callbacks, to help understand
what's the difference, and how to enable the right ones.

Signed-off-by: Zhihong Wang 
Acked-by: John McNamara 

--
Changes in v3:

   1. Rephrase for clearer description.

--
Changes in v2:

   1. Changes on format and few descriptions.

---
 doc/guides/nics/virtio.rst | 75 ++
 1 file changed, 69 insertions(+), 6 deletions(-)

diff --git a/doc/guides/nics/virtio.rst b/doc/guides/nics/virtio.rst
index 06ca433..c6335d4 100644
--- a/doc/guides/nics/virtio.rst
+++ b/doc/guides/nics/virtio.rst
@@ -73,7 +73,7 @@ In this release, the virtio PMD driver provides the basic 
functionality of packe

 *   It supports multicast packets and promiscuous mode.

-*   The descriptor number for the RX/TX queue is hard-coded to be 256 by qemu.
+*   The descriptor number for the Rx/Tx queue is hard-coded to be 256 by qemu.
 If given a different descriptor number by the upper application,
 the virtio PMD generates a warning and fall back to the hard-coded value.

@@ -163,8 +163,9 @@ Host2VM communication example
 which means received packets come from vEth0, and transmitted packets is 
sent to vEth0.

 #.  In the guest, bind the virtio device to the uio_pci_generic kernel module 
and start the forwarding application.
-When the virtio port in guest bursts rx, it is getting packets from the 
raw socket's receive queue.
-When the virtio port bursts tx, it is sending packet to the tx_q.
+When the virtio port in guest bursts Rx, it is getting packets from the
+raw socket's receive queue.
+When the virtio port bursts Tx, it is sending packet to the tx_q.

 .. code-block:: console

@@ -183,7 +184,9 @@ Host2VM communication example

 The packet reception and transmission flow path is:

-IXIA packet generator->82599 PF->KNI rx queue->KNI raw socket queue->Guest 
VM virtio port 0 rx burst->Guest VM virtio port 0 tx burst-> KNI tx 
queue->82599 PF-> IXIA packet generator
+IXIA packet generator->82599 PF->KNI Rx queue->KNI raw socket queue->Guest
+VM virtio port 0 Rx burst->Guest VM virtio port 0 Tx burst-> KNI Tx queue
+->82599 PF-> IXIA packet generator

 Virtio with qemu virtio Back End
 
@@ -206,8 +209,68 @@ Virtio with qemu virtio Back End

 In this example, the packet reception flow path is:

-IXIA packet generator->82599 PF->Linux Bridge->TAP0's socket queue-> Guest 
VM virtio port 0 rx burst-> Guest VM 82599 VF port1 tx burst-> IXIA packet 
generator
+IXIA packet generator->82599 PF->Linux Bridge->TAP0's socket queue-> Guest
+VM virtio port 0 Rx burst-> Guest VM 82599 VF port1 Tx burst-> IXIA packet
+generator

 The packet transmission flow is:

-IXIA packet generator-> Guest VM 82599 VF port1 rx burst-> Guest VM virtio 
port 0 tx burst-> tap -> Linux Bridge->82599 PF-> IXIA packet generator
+IXIA packet generator-> Guest VM 82599 VF port1 Rx burst-> Guest VM virtio
+port 0 Tx burst-> tap -> Linux Bridge->82599 PF-> IXIA packet generator
+
+
+Virtio PMD Rx/Tx Callbacks
+--
+
+Virtio driver has 3 Rx callbacks and 2 Tx callbacks.
+
+Rx callbacks:
+
+#. ``virtio_recv_pkts``:
+   Regular version without mergeable Rx buffer support.
+
+#. ``virtio_recv_mergeable_pkts``:
+   Regular version with mergeable Rx buffer support.
+
+#. ``virtio_recv_pkts_vec``:
+   Vector version without mergeable Rx buffer support, also fixes the available
+   ring indexes and uses vector instructions to optimize performance.
+
+Tx callbacks:
+
+#. ``virtio_xmit_pkts``:
+   Regular version.
+
+#. ``virtio_xmit_pkts_simple``:
+   Vector version fixes the available ring indexes to optimize performance.
+
+
+By default, the non-vector callbacks are used:
+
+*   For Rx: If mergeable Rx buffers is disabled then ``virtio_recv_pkts`` is
+used; otherwise ``virtio_recv_mergeable_pkts``.
+
+*   For Tx: ``virtio_xmit_pkts``.
+
+
+Vector callbacks will be used when:
+
+*   ``txq_flags`` is set to ``VIRTIO_SIMPLE_FLAGS`` (0xF01), which implies:
+
+*   Single segment is specified.
+
+*   No offload support is needed.
+
+*   Mergeable Rx buffers is disabled.
+
+The corresponding callbacks are:
+
+*   For Rx: ``virtio_recv_pkts_vec``.
+
+*   For Tx: ``virtio_xmit_pkts_simple``.
+
+
+Example of using the vector version of the virtio poll mode driver in
+``testpmd``::
+
+   testpmd -c 0x7 -n 4 -- -i --txqflags=0xF01 --rxq=1 --txq=1 --nb-cores=1
-- 
2.5.0



[dpdk-dev] [PATCH v2] doc: virtio pmd versions

2016-06-14 Thread Zhihong Wang
This patch explains all the versions of current virtio pmd implementation,
what's the difference, and how to choose the right version.

--
Changes in v2:

   1. Changes on format and few descriptions.


Signed-off-by: Zhihong Wang 
---
 doc/guides/nics/virtio.rst | 64 +-
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/doc/guides/nics/virtio.rst b/doc/guides/nics/virtio.rst
index 06ca433..a4fef89 100644
--- a/doc/guides/nics/virtio.rst
+++ b/doc/guides/nics/virtio.rst
@@ -73,7 +73,7 @@ In this release, the virtio PMD driver provides the basic 
functionality of packe

 *   It supports multicast packets and promiscuous mode.

-*   The descriptor number for the RX/TX queue is hard-coded to be 256 by qemu.
+*   The descriptor number for the Rx/Tx queue is hard-coded to be 256 by qemu.
 If given a different descriptor number by the upper application,
 the virtio PMD generates a warning and fall back to the hard-coded value.

@@ -163,8 +163,8 @@ Host2VM communication example
 which means received packets come from vEth0, and transmitted packets is 
sent to vEth0.

 #.  In the guest, bind the virtio device to the uio_pci_generic kernel module 
and start the forwarding application.
-When the virtio port in guest bursts rx, it is getting packets from the 
raw socket's receive queue.
-When the virtio port bursts tx, it is sending packet to the tx_q.
+When the virtio port in guest bursts Rx, it is getting packets from the 
raw socket's receive queue.
+When the virtio port bursts Tx, it is sending packet to the tx_q.

 .. code-block:: console

@@ -183,7 +183,7 @@ Host2VM communication example

 The packet reception and transmission flow path is:

-IXIA packet generator->82599 PF->KNI rx queue->KNI raw socket queue->Guest 
VM virtio port 0 rx burst->Guest VM virtio port 0 tx burst-> KNI tx 
queue->82599 PF-> IXIA packet generator
+IXIA packet generator->82599 PF->KNI Rx queue->KNI raw socket queue->Guest 
VM virtio port 0 Rx burst->Guest VM virtio port 0 Tx burst-> KNI Tx 
queue->82599 PF-> IXIA packet generator

 Virtio with qemu virtio Back End
 
@@ -206,8 +206,60 @@ Virtio with qemu virtio Back End

 In this example, the packet reception flow path is:

-IXIA packet generator->82599 PF->Linux Bridge->TAP0's socket queue-> Guest 
VM virtio port 0 rx burst-> Guest VM 82599 VF port1 tx burst-> IXIA packet 
generator
+IXIA packet generator->82599 PF->Linux Bridge->TAP0's socket queue-> Guest 
VM virtio port 0 Rx burst-> Guest VM 82599 VF port1 Tx burst-> IXIA packet 
generator

 The packet transmission flow is:

-IXIA packet generator-> Guest VM 82599 VF port1 rx burst-> Guest VM virtio 
port 0 tx burst-> tap -> Linux Bridge->82599 PF-> IXIA packet generator
+IXIA packet generator-> Guest VM 82599 VF port1 Rx burst-> Guest VM virtio 
port 0 Tx burst-> tap -> Linux Bridge->82599 PF-> IXIA packet generator
+
+
+Virtio PMD Versions
+---
+
+Virtio driver has 3 versions of Rx functions and 2 versions of Tx functions.
+
+Rx functions:
+
+#. ``virtio_recv_pkts``:
+   Regular version without mergeable Rx buffer support.
+
+#. ``virtio_recv_mergeable_pkts``:
+   Regular version with mergeable Rx buffer support.
+
+#. ``virtio_recv_pkts_vec``:
+   Simple version without mergeable Rx buffer support, also fixes the 
available ring indexes and uses vector instructions to optimize performance.
+
+Tx functions:
+
+#. ``virtio_xmit_pkts``:
+   Regular version.
+
+#. ``virtio_xmit_pkts_simple``:
+   Simple version fixes the available ring indexes to optimize performance.
+
+
+By default, the non-vector versions are used:
+
+*   For Rx: If mergeable Rx buffers is disabled then ``virtio_recv_pkts`` is 
used; otherwise ``virtio_recv_mergeable_pkts``.
+
+*   For Tx: ``virtio_xmit_pkts``.
+
+
+Setting ``txq_flags`` to ``VIRTIO_SIMPLE_FLAGS`` (0xF01) enables the simple 
version of the virtio poll mode driver:
+
+*   For Rx: ``virtio_recv_pkts_vec``.
+
+*   For Tx: ``virtio_xmit_pkts_simple``.
+
+
+The simple version will only be enabled when:
+
+*   Mergeable Rx buffers is disabled.
+
+*   Single segment is specified.
+
+*   No offload support is needed.
+
+Example of using the simple version of the virtio poll mode driver in 
``testpmd``::
+
+   testpmd -c 0x7 -n 4 -- -i --txqflags=0xF01 --rxq=1 --txq=1 --nb-cores=1
-- 
2.5.0



[dpdk-dev] [PATCH v3 5/5] testpmd: show topology at forwarding start

2016-06-14 Thread Zhihong Wang
This patch shows the topology at forwarding start.

"show config fwd" also does this, but showing it directly can reduce the
possibility of misconfiguration.

Currently fwd_config_display() calls fwd_config_setup(), this misleading
behavior will be fixed in other patches.


Signed-off-by: Zhihong Wang 
Acked-by: Pablo de Lara 
---
 app/test-pmd/testpmd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 74b044e..50dddbe 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1016,6 +1016,7 @@ start_packet_forwarding(int with_tx_first)
flush_fwd_rx_queues();

fwd_config_setup();
+   fwd_config_display();
rxtx_config_display();

for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
-- 
2.5.0



[dpdk-dev] [PATCH v3 4/5] testpmd: handle all rxqs in rss setup

2016-06-14 Thread Zhihong Wang
This patch removes constraints in rxq handling when multiqueue is enabled
to handle all the rxqs.

Current testpmd forces a dedicated core for each rxq, some rxqs may be
ignored when core number is less than rxq number, and that causes confusion
and inconvenience.

One example: One Red Hat engineer was doing multiqueue test, there're 2
ports in guest each with 4 queues, and testpmd was used as the forwarding
engine in guest, as usual he used 1 core for forwarding, as a result he
only saw traffic from port 0 queue 0 to port 1 queue 0, then a lot of
emails and quite some time are spent to root cause it, and of course it's
caused by this unreasonable testpmd behavior.  

Moreover, even if we understand this behavior, if we want to test the
above case, we still need 8 cores for a single guest to poll all the
rxqs, obviously this is too expensive.

We have met quite a lot of cases like this; one recent example:
http://openvswitch.org/pipermail/dev/2016-June/072110.html


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/config.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index ede7c78..4719a08 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1199,19 +1199,13 @@ rss_fwd_config_setup(void)
cur_fwd_config.nb_fwd_ports = nb_fwd_ports;
cur_fwd_config.nb_fwd_streams =
(streamid_t) (nb_q * cur_fwd_config.nb_fwd_ports);
-   if (cur_fwd_config.nb_fwd_streams > cur_fwd_config.nb_fwd_lcores)
-   cur_fwd_config.nb_fwd_streams =
-   (streamid_t)cur_fwd_config.nb_fwd_lcores;
-   else
-   cur_fwd_config.nb_fwd_lcores =
-   (lcoreid_t)cur_fwd_config.nb_fwd_streams;

/* reinitialize forwarding streams */
init_fwd_streams();

setup_fwd_config_of_each_lcore(&cur_fwd_config);
rxp = 0; rxq = 0;
-   for (lc_id = 0; lc_id < cur_fwd_config.nb_fwd_lcores; lc_id++) {
+   for (lc_id = 0; lc_id < cur_fwd_config.nb_fwd_streams; lc_id++) {
struct fwd_stream *fs;

fs = fwd_streams[lc_id];
-- 
2.5.0



[dpdk-dev] [PATCH v3 3/5] testpmd: show throughput in port stats

2016-06-14 Thread Zhihong Wang
This patch adds throughput numbers (in the period since last use of this
command) in port statistics display for "show port stats (port_id|all)".


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/config.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index a85bb5f..ede7c78 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -92,6 +92,7 @@
 #include 
 #include 
 #include 
+#include <rte_cycles.h>

 #include "testpmd.h"

@@ -150,6 +151,11 @@ print_ethaddr(const char *name, struct ether_addr 
*eth_addr)
 void
 nic_stats_display(portid_t port_id)
 {
+   static uint64_t prev_pkts_rx[RTE_MAX_ETHPORTS];
+   static uint64_t prev_pkts_tx[RTE_MAX_ETHPORTS];
+   static uint64_t prev_cycles[RTE_MAX_ETHPORTS];
+   uint64_t diff_pkts_rx, diff_pkts_tx, diff_cycles;
+   uint64_t mpps_rx, mpps_tx;
struct rte_eth_stats stats;
struct rte_port *port = &ports[port_id];
uint8_t i;
@@ -209,6 +215,23 @@ nic_stats_display(portid_t port_id)
}
}

+   diff_cycles = prev_cycles[port_id];
+   prev_cycles[port_id] = rte_rdtsc();
+   if (diff_cycles > 0)
+   diff_cycles = prev_cycles[port_id] - diff_cycles;
+
+   diff_pkts_rx = stats.ipackets - prev_pkts_rx[port_id];
+   diff_pkts_tx = stats.opackets - prev_pkts_tx[port_id];
+   prev_pkts_rx[port_id] = stats.ipackets;
+   prev_pkts_tx[port_id] = stats.opackets;
+   mpps_rx = diff_cycles > 0 ?
+   diff_pkts_rx * rte_get_tsc_hz() / diff_cycles : 0;
+   mpps_tx = diff_cycles > 0 ?
+   diff_pkts_tx * rte_get_tsc_hz() / diff_cycles : 0;
+   printf("\n  Throughput (since last show)\n");
+   printf("  Rx-pps: %12"PRIu64"\n  Tx-pps: %12"PRIu64"\n",
+   mpps_rx, mpps_tx);
+
printf("  %s%s\n",
   nic_stats_border, nic_stats_border);
 }
-- 
2.5.0



[dpdk-dev] [PATCH v3 2/5] testpmd: configurable tx_first burst number

2016-06-14 Thread Zhihong Wang
This patch enables configurable tx_first burst number.

Use "start tx_first (burst_num)" to specify how many bursts of packets to
be sent before forwarding start, or "start tx_first" like before for the
default 1 burst send.


Signed-off-by: Zhihong Wang 
Acked-by: Pablo de Lara 
---
 app/test-pmd/cmdline.c  | 41 +
 app/test-pmd/testpmd.c  |  7 +++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  6 +++--
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index e414c0f..9e0b518 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -5386,6 +5386,46 @@ cmdline_parse_inst_t cmd_start_tx_first = {
},
 };

+/* *** START FORWARDING WITH N TX BURST FIRST *** */
+struct cmd_start_tx_first_n_result {
+   cmdline_fixed_string_t start;
+   cmdline_fixed_string_t tx_first;
+   uint32_t tx_num;
+};
+
+static void
+cmd_start_tx_first_n_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+   struct cmd_start_tx_first_n_result *res = parsed_result;
+
+   start_packet_forwarding(res->tx_num);
+}
+
+cmdline_parse_token_string_t cmd_start_tx_first_n_start =
+   TOKEN_STRING_INITIALIZER(struct cmd_start_tx_first_n_result,
+   start, "start");
+cmdline_parse_token_string_t cmd_start_tx_first_n_tx_first =
+   TOKEN_STRING_INITIALIZER(struct cmd_start_tx_first_n_result,
+   tx_first, "tx_first");
+cmdline_parse_token_num_t cmd_start_tx_first_n_tx_num =
+   TOKEN_NUM_INITIALIZER(struct cmd_start_tx_first_n_result,
+   tx_num, UINT32);
+
+cmdline_parse_inst_t cmd_start_tx_first_n = {
+   .f = cmd_start_tx_first_n_parsed,
+   .data = NULL,
+   .help_str = "start packet forwarding, after sending <num> "
+   "bursts of packets",
+   .tokens = {
+   (void *)&cmd_start_tx_first_n_start,
+   (void *)&cmd_start_tx_first_n_tx_first,
+   (void *)&cmd_start_tx_first_n_tx_num,
+   NULL,
+   },
+};
+
 /* *** SET LINK UP *** */
 struct cmd_set_link_up_result {
cmdline_fixed_string_t set;
@@ -10542,6 +10582,7 @@ cmdline_parse_ctx_t main_ctx[] = {
(cmdline_parse_inst_t *)_showcfg,
(cmdline_parse_inst_t *)_start,
(cmdline_parse_inst_t *)_start_tx_first,
+   (cmdline_parse_inst_t *)_start_tx_first_n,
(cmdline_parse_inst_t *)_set_link_up,
(cmdline_parse_inst_t *)_set_link_down,
(cmdline_parse_inst_t *)_reset,
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index dfd27d5..74b044e 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1049,8 +1049,11 @@ start_packet_forwarding(int with_tx_first)
for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++)
(*port_fwd_begin)(fwd_ports_ids[i]);
}
-   launch_packet_forwarding(run_one_txonly_burst_on_core);
-   rte_eal_mp_wait_lcore();
+   while (with_tx_first--) {
+   launch_packet_forwarding(
+   run_one_txonly_burst_on_core);
+   rte_eal_mp_wait_lcore();
+   }
port_fwd_end = tx_only_engine.port_fwd_end;
if (port_fwd_end != NULL) {
for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++)
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst 
b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index d812989..4e19229 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -98,9 +98,11 @@ Start packet forwarding with current configuration::
 start tx_first
 ~~

-Start packet forwarding with current configuration after sending one burst of 
packets::
+Start packet forwarding with current configuration after sending specified 
number of bursts of packets::

-   testpmd> start tx_first
+   testpmd> start tx_first (""|burst_num)
+
+The default burst number is 1 when ``burst_num`` not presented.

 stop
 
-- 
2.5.0



[dpdk-dev] [PATCH v3 1/5] testpmd: add retry option

2016-06-14 Thread Zhihong Wang
This patch adds retry option in testpmd to prevent most packet losses.
It can be enabled by "set fwd  retry". All modes except rxonly
support this option.

Adding retry mechanism expands test case coverage to support scenarios
where packet loss affects test results.


Signed-off-by: Zhihong Wang 
Acked-by: Pablo de Lara 
---
 app/test-pmd/Makefile   |   1 -
 app/test-pmd/cmdline.c  |  75 -
 app/test-pmd/config.c   |  43 ++-
 app/test-pmd/csumonly.c |  12 ++
 app/test-pmd/flowgen.c  |  12 ++
 app/test-pmd/icmpecho.c |  15 +++
 app/test-pmd/iofwd.c|  22 +++-
 app/test-pmd/macfwd-retry.c | 167 
 app/test-pmd/macfwd.c   |  13 +++
 app/test-pmd/macswap.c  |  12 ++
 app/test-pmd/testpmd.c  |   4 +-
 app/test-pmd/testpmd.h  |  11 +-
 app/test-pmd/txonly.c   |  12 ++
 doc/guides/testpmd_app_ug/run_app.rst   |   1 -
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  12 +-
 15 files changed, 224 insertions(+), 188 deletions(-)
 delete mode 100644 app/test-pmd/macfwd-retry.c

diff --git a/app/test-pmd/Makefile b/app/test-pmd/Makefile
index 40039a1..2a0b5a5 100644
--- a/app/test-pmd/Makefile
+++ b/app/test-pmd/Makefile
@@ -50,7 +50,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline.c
 SRCS-y += config.c
 SRCS-y += iofwd.c
 SRCS-y += macfwd.c
-SRCS-y += macfwd-retry.c
 SRCS-y += macswap.c
 SRCS-y += flowgen.c
 SRCS-y += rxonly.c
diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index fd389ac..e414c0f 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -246,8 +246,8 @@ static void cmd_help_long_parsed(void *parsed_result,
"Set number of packets per burst.\n\n"

"set burst tx delay (microseconds) retry (num)\n"
-   "Set the transmit delay time and number of retries"
-   " in mac_retry forwarding mode.\n\n"
+   "Set the transmit delay time and number of retries,"
+   " effective when retry is enabled.\n\n"

"set txpkts (x[,y]*)\n"
"Set the length of each segment of TXONLY"
@@ -4557,6 +4557,7 @@ static void cmd_set_fwd_mode_parsed(void *parsed_result,
 {
struct cmd_set_fwd_mode_result *res = parsed_result;

+   retry_enabled = 0;
set_pkt_forwarding_mode(res->mode);
 }

@@ -4602,6 +4603,74 @@ static void cmd_set_fwd_mode_init(void)
token_struct->string_data.str = token;
 }

+/* *** SET RETRY FORWARDING MODE *** */
+struct cmd_set_fwd_retry_mode_result {
+   cmdline_fixed_string_t set;
+   cmdline_fixed_string_t fwd;
+   cmdline_fixed_string_t mode;
+   cmdline_fixed_string_t retry;
+};
+
+static void cmd_set_fwd_retry_mode_parsed(void *parsed_result,
+   __attribute__((unused)) struct cmdline *cl,
+   __attribute__((unused)) void *data)
+{
+   struct cmd_set_fwd_retry_mode_result *res = parsed_result;
+
+   retry_enabled = 1;
+   set_pkt_forwarding_mode(res->mode);
+}
+
+cmdline_parse_token_string_t cmd_setfwd_retry_set =
+   TOKEN_STRING_INITIALIZER(struct cmd_set_fwd_retry_mode_result,
+   set, "set");
+cmdline_parse_token_string_t cmd_setfwd_retry_fwd =
+   TOKEN_STRING_INITIALIZER(struct cmd_set_fwd_retry_mode_result,
+   fwd, "fwd");
+cmdline_parse_token_string_t cmd_setfwd_retry_mode =
+   TOKEN_STRING_INITIALIZER(struct cmd_set_fwd_retry_mode_result,
+   mode,
+   "" /* defined at init */);
+cmdline_parse_token_string_t cmd_setfwd_retry_retry =
+   TOKEN_STRING_INITIALIZER(struct cmd_set_fwd_retry_mode_result,
+   retry, "retry");
+
+cmdline_parse_inst_t cmd_set_fwd_retry_mode = {
+   .f = cmd_set_fwd_retry_mode_parsed,
+   .data = NULL,
+   .help_str = NULL, /* defined at init */
+   .tokens = {
+   (void *)&cmd_setfwd_retry_set,
+   (void *)&cmd_setfwd_retry_fwd,
+   (void *)&cmd_setfwd_retry_mode,
+   (void *)&cmd_setfwd_retry_retry,
+   NULL,
+   },
+};
+
+static void cmd_set_fwd_retry_mode_init(void)
+{
+   char *modes, *c;
+   static char token[128];
+   static char help[256];
+   cmdline_parse_token_string_t *token_struct;
+
+   modes = list_pkt_forwarding_retry_modes();
+   snprintf(help, sizeof(help), "set fwd %s retry - "
+   "set packet forwarding mode with retry", modes);
+   cmd_set_fwd_retry_mode.help_str = help;

[dpdk-dev] [PATCH v3 0/5] vhost/virtio performance loopback utility

2016-06-14 Thread Zhihong Wang
This patch enables vhost/virtio pmd performance loopback test in testpmd.
All the features are for general usage.

The loopback test focuses on the maximum full-path packet forwarding
performance between host and guest, it runs vhost/virtio pmd only without
introducing extra overhead.

Therefore, the main requirement is traffic generation, since there's no
other packet generators like IXIA to help.

In current testpmd, iofwd is the best candidate to perform this loopback
test because it's the fastest possible forwarding engine: Start testpmd
iofwd in host with 1 vhost port, and start testpmd iofwd in the connected
guest with 1 corresponding virtio port, and these 2 ports form a forwarding
loop: Host vhost Tx -> Guest virtio Rx -> Guest virtio Tx -> Host vhost Rx.

As to traffic generation, "start tx_first" injects a burst of packets into
the loop.

However 2 issues remain:

   1. If only 1 burst of packets are injected in the loop, there will
  definitely be empty Rx operations, e.g. When guest virtio port send
  burst to the host, then it starts the Rx immediately, it's likely
  the packets are still being forwarded by host vhost port and haven't
  reached the guest yet.

  We need to fill up the ring to keep all pmds busy.

   2. iofwd doesn't provide retry mechanism, so if packet loss occurs,
  there won't be a full burst in the loop.

To address these issues, this patch:

   1. Add retry option in testpmd to prevent most packet losses.

   2. Add parameter to enable configurable tx_first burst number.

Other related improvements include:

   1. Handle all rxqs when multiqueue is enabled: Current testpmd forces a
  single core for each rxq which causes inconvenience and confusion.

  This change doesn't break anything, we can still force a single core
  for each rxq, by giving the same number of cores with the number of
  rxqs.

  One example: One Red Hat engineer was doing multiqueue test, there're
  2 ports in guest each with 4 queues, and testpmd was used as the
  forwarding engine in guest, as usual he used 1 core for forwarding, as
  a results he only saw traffic from port 0 queue 0 to port 1 queue 0,
  then a lot of emails and quite some time are spent to root cause it,
  and of course it's caused by this unreasonable testpmd behavior.

  Moreover, even if we understand this behavior, if we want to test the
  above case, we still need 8 cores for a single guest to poll all the
  rxqs, obviously this is too expensive.

  We met quite a lot cases like this, one recent example:
  http://openvswitch.org/pipermail/dev/2016-June/072110.html

   2. Show topology at forwarding start: "show config fwd" also does this,
  but show it directly can reduce the possibility of mis-configuration.

  Like the case above, if testpmd shows topology at forwarding start,
  then probably all those debugging efforts can be saved.

   3. Add throughput information in port statistics display for "show port
  stats (port_id|all)".

Finally there's documentation update.

Example on how to enable vhost/virtio performance loopback test:

   1. Start testpmd in host with 1 vhost port only.

   2. Start testpmd in guest with only 1 virtio port connected to the
  corresponding vhost port.

   3. "set fwd io retry" in testpmds in both host and guest.

   4. "start" in testpmd in guest.

   5. "start tx_first 16" in testpmd in host.

Then use "show port stats all" to monitor the performance.

--
Changes in v2:

   1. Add retry as an option for existing forwarding engines except rxonly.

   2. Minor code adjustment and more detailed patch description.

--
Changes in v3:

   1. Add more details in commit log.

   2. Give variables more meaningful names.

   3. Fix a typo in existing doc.

   4. Rebase the patches.


Zhihong Wang (5):
  testpmd: add retry option
  testpmd: configurable tx_first burst number
  testpmd: show throughput in port stats
  testpmd: handle all rxqs in rss setup
  testpmd: show topology at forwarding start

 app/test-pmd/Makefile   |   1 -
 app/test-pmd/cmdline.c  | 116 ++-
 app/test-pmd/config.c   |  74 ++--
 app/test-pmd/csumonly.c |  12 ++
 app/test-pmd/flowgen.c  |  12 ++
 app/test-pmd/icmpecho.c |  15 +++
 app/test-pmd/iofwd.c|  22 +++-
 app/test-pmd/macfwd-retry.c | 167 
 app/test-pmd/macfwd.c   |  13 +++
 app/test-pmd/macswap.c  |  12 ++
 app/test-pmd/testpmd.c  |  12 +-
 app/test-pmd/testpmd.h  |  11 +-
 app/test-pmd/txonly.c   |  12 ++
 doc/guides/testpmd_app_ug/run_app.rst   |   1 

[dpdk-dev] [PATCH v2 5/5] testpmd: show topology at forwarding start

2016-06-01 Thread Zhihong Wang
This patch show topology at forwarding start.

"show config fwd" also does this, but showing it directly can reduce the
possibility of misconfiguration.


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/cmdline.c | 2 +-
 app/test-pmd/config.c  | 4 ++--
 app/test-pmd/testpmd.c | 2 +-
 app/test-pmd/testpmd.h | 3 +--
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index ef66d4e..bc800f8 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -5445,7 +5445,7 @@ static void cmd_showcfg_parsed(void *parsed_result,
else if (!strcmp(res->what, "cores"))
fwd_lcores_config_display();
else if (!strcmp(res->what, "fwd"))
-   fwd_config_display();
+   fwd_config_setup_display();
else if (!strcmp(res->what, "txpkts"))
show_tx_pkt_segments();
 }
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index cfdacd8..c70f308 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1383,7 +1383,7 @@ icmp_echo_config_setup(void)
}
 }

-void
+static void
 fwd_config_setup(void)
 {
cur_fwd_config.fwd_eng = cur_fwd_eng;
@@ -1443,7 +1443,7 @@ pkt_fwd_config_display(struct fwd_config *cfg)


 void
-fwd_config_display(void)
+fwd_config_setup_display(void)
 {
fwd_config_setup();
pkt_fwd_config_display(&cur_fwd_config);
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 9b1d99c..b946034 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1009,7 +1009,7 @@ start_packet_forwarding(int with_tx_first)
if(!no_flush_rx)
flush_fwd_rx_queues();

-   fwd_config_setup();
+   fwd_config_setup_display();
rxtx_config_display();

for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 62ec055..5fd08e8 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -480,9 +480,8 @@ void port_infos_display(portid_t port_id);
 void rx_queue_infos_display(portid_t port_idi, uint16_t queue_id);
 void tx_queue_infos_display(portid_t port_idi, uint16_t queue_id);
 void fwd_lcores_config_display(void);
-void fwd_config_display(void);
+void fwd_config_setup_display(void);
 void rxtx_config_display(void);
-void fwd_config_setup(void);
 void set_def_fwd_config(void);
 void reconfig(portid_t new_port_id, unsigned socket_id);
 int init_fwd_streams(void);
-- 
2.5.0



[dpdk-dev] [PATCH v2 4/5] testpmd: handle all rxqs in rss setup

2016-06-01 Thread Zhihong Wang
This patch removes constraints in rxq handling when multiqueue is enabled
to handle all the rxqs.

Current testpmd forces a dedicated core for each rxq, some rxqs may be
ignored when core number is less than rxq number, and that causes confusion
and inconvenience.


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/config.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index f487b87..cfdacd8 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1196,19 +1196,13 @@ rss_fwd_config_setup(void)
cur_fwd_config.nb_fwd_ports = nb_fwd_ports;
cur_fwd_config.nb_fwd_streams =
(streamid_t) (nb_q * cur_fwd_config.nb_fwd_ports);
-   if (cur_fwd_config.nb_fwd_streams > cur_fwd_config.nb_fwd_lcores)
-   cur_fwd_config.nb_fwd_streams =
-   (streamid_t)cur_fwd_config.nb_fwd_lcores;
-   else
-   cur_fwd_config.nb_fwd_lcores =
-   (lcoreid_t)cur_fwd_config.nb_fwd_streams;

/* reinitialize forwarding streams */
init_fwd_streams();

setup_fwd_config_of_each_lcore(_fwd_config);
rxp = 0; rxq = 0;
-   for (lc_id = 0; lc_id < cur_fwd_config.nb_fwd_lcores; lc_id++) {
+   for (lc_id = 0; lc_id < cur_fwd_config.nb_fwd_streams; lc_id++) {
struct fwd_stream *fs;

fs = fwd_streams[lc_id];
-- 
2.5.0



[dpdk-dev] [PATCH v2 3/5] testpmd: show throughput in port stats

2016-06-01 Thread Zhihong Wang
This patch adds throughput numbers (in the period since last use of this
command) in port statistics display for "show port stats (port_id|all)".


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/config.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index c611649..f487b87 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -92,6 +92,7 @@
 #include 
 #include 
 #include 
+#include <rte_cycles.h>

 #include "testpmd.h"

@@ -150,6 +151,10 @@ print_ethaddr(const char *name, struct ether_addr 
*eth_addr)
 void
 nic_stats_display(portid_t port_id)
 {
+   static uint64_t sum_rx[RTE_MAX_ETHPORTS];
+   static uint64_t sum_tx[RTE_MAX_ETHPORTS];
+   static uint64_t cycles[RTE_MAX_ETHPORTS];
+   uint64_t pkt_rx, pkt_tx, cycle;
struct rte_eth_stats stats;
struct rte_port *port = &ports[port_id];
uint8_t i;
@@ -209,6 +214,21 @@ nic_stats_display(portid_t port_id)
}
}

+   cycle = cycles[port_id];
+   cycles[port_id] = rte_rdtsc();
+   if (cycle > 0)
+   cycle = cycles[port_id] - cycle;
+
+   pkt_rx = stats.ipackets - sum_rx[port_id];
+   pkt_tx = stats.opackets - sum_tx[port_id];
+   sum_rx[port_id] = stats.ipackets;
+   sum_tx[port_id] = stats.opackets;
+   printf("\n  Throughput (since last show)\n");
+   printf("  RX-pps: %12"PRIu64"\n"
+   "  TX-pps: %12"PRIu64"\n",
+   cycle > 0 ? pkt_rx * rte_get_tsc_hz() / cycle : 0,
+   cycle > 0 ? pkt_tx * rte_get_tsc_hz() / cycle : 0);
+
printf("  %s%s\n",
   nic_stats_border, nic_stats_border);
 }
-- 
2.5.0



[dpdk-dev] [PATCH v2 2/5] testpmd: configurable tx_first burst number

2016-06-01 Thread Zhihong Wang
This patch enables configurable tx_first burst number.

Use "start tx_first (burst_num)" to specify how many bursts of packets to
be sent before forwarding start, or "start tx_first" like before for the
default 1 burst send.


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/cmdline.c  | 41 +
 app/test-pmd/testpmd.c  |  7 +++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  6 +++--
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 0af3f05..ef66d4e 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -5309,6 +5309,46 @@ cmdline_parse_inst_t cmd_start_tx_first = {
},
 };

+/* *** START FORWARDING WITH N TX BURST FIRST *** */
+struct cmd_start_tx_first_n_result {
+   cmdline_fixed_string_t start;
+   cmdline_fixed_string_t tx_first;
+   uint32_t tx_num;
+};
+
+static void
+cmd_start_tx_first_n_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+   struct cmd_start_tx_first_n_result *res = parsed_result;
+
+   start_packet_forwarding(res->tx_num);
+}
+
+cmdline_parse_token_string_t cmd_start_tx_first_n_start =
+   TOKEN_STRING_INITIALIZER(struct cmd_start_tx_first_n_result,
+   start, "start");
+cmdline_parse_token_string_t cmd_start_tx_first_n_tx_first =
+   TOKEN_STRING_INITIALIZER(struct cmd_start_tx_first_n_result,
+   tx_first, "tx_first");
+cmdline_parse_token_num_t cmd_start_tx_first_n_tx_num =
+   TOKEN_NUM_INITIALIZER(struct cmd_start_tx_first_n_result,
+   tx_num, UINT32);
+
+cmdline_parse_inst_t cmd_start_tx_first_n = {
+   .f = cmd_start_tx_first_n_parsed,
+   .data = NULL,
+   .help_str = "start packet forwarding, after sending <num> "
+   "bursts of packets",
+   .tokens = {
+   (void *)_start_tx_first_n_start,
+   (void *)_start_tx_first_n_tx_first,
+   (void *)_start_tx_first_n_tx_num,
+   NULL,
+   },
+};
+
 /* *** SET LINK UP *** */
 struct cmd_set_link_up_result {
cmdline_fixed_string_t set;
@@ -10468,6 +10508,7 @@ cmdline_parse_ctx_t main_ctx[] = {
(cmdline_parse_inst_t *)&cmd_showcfg,
(cmdline_parse_inst_t *)&cmd_start,
(cmdline_parse_inst_t *)&cmd_start_tx_first,
+   (cmdline_parse_inst_t *)&cmd_start_tx_first_n,
(cmdline_parse_inst_t *)&cmd_set_link_up,
(cmdline_parse_inst_t *)&cmd_set_link_down,
(cmdline_parse_inst_t *)&cmd_reset,
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 7ab67b8..9b1d99c 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1043,8 +1043,11 @@ start_packet_forwarding(int with_tx_first)
for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++)
(*port_fwd_begin)(fwd_ports_ids[i]);
}
-   launch_packet_forwarding(run_one_txonly_burst_on_core);
-   rte_eal_mp_wait_lcore();
+   while (with_tx_first--) {
+   launch_packet_forwarding(
+   run_one_txonly_burst_on_core);
+   rte_eal_mp_wait_lcore();
+   }
port_fwd_end = tx_only_engine.port_fwd_end;
if (port_fwd_end != NULL) {
for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++)
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst 
b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 03412db..ff94593 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -98,9 +98,11 @@ Start packet forwarding with current configuration::
 start tx_first
 ~~

-Start packet forwarding with current configuration after sending one burst of 
packets::
+Start packet forwarding with current configuration after sending specified 
number of bursts of packets::

-   testpmd> start tx_first
+   testpmd> start tx_first (""|burst_num)
+
+The default burst number is 1 when ``burst_num`` not presented.

 stop
 
-- 
2.5.0



[dpdk-dev] [PATCH v2 1/5] testpmd: add retry option

2016-06-01 Thread Zhihong Wang
This patch adds retry option in testpmd to prevent most packet losses.
It can be enabled by "set fwd <mode> retry". All modes except rxonly
support this option.

Adding retry mechanism expands test case coverage to support scenarios
where packet loss affects test results.


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/Makefile   |   1 -
 app/test-pmd/cmdline.c  |  75 -
 app/test-pmd/config.c   |  47 +++-
 app/test-pmd/csumonly.c |  12 ++
 app/test-pmd/flowgen.c  |  12 ++
 app/test-pmd/icmpecho.c |  15 +++
 app/test-pmd/iofwd.c|  22 +++-
 app/test-pmd/macfwd-retry.c | 164 
 app/test-pmd/macfwd.c   |  13 +++
 app/test-pmd/macswap.c  |  12 ++
 app/test-pmd/testpmd.c  |   4 +-
 app/test-pmd/testpmd.h  |  11 +-
 app/test-pmd/txonly.c   |  12 ++
 doc/guides/testpmd_app_ug/run_app.rst   |   1 -
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  10 +-
 15 files changed, 227 insertions(+), 184 deletions(-)
 delete mode 100644 app/test-pmd/macfwd-retry.c

diff --git a/app/test-pmd/Makefile b/app/test-pmd/Makefile
index 40039a1..2a0b5a5 100644
--- a/app/test-pmd/Makefile
+++ b/app/test-pmd/Makefile
@@ -50,7 +50,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline.c
 SRCS-y += config.c
 SRCS-y += iofwd.c
 SRCS-y += macfwd.c
-SRCS-y += macfwd-retry.c
 SRCS-y += macswap.c
 SRCS-y += flowgen.c
 SRCS-y += rxonly.c
diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index c5b9479..0af3f05 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -246,8 +246,8 @@ static void cmd_help_long_parsed(void *parsed_result,
"Set number of packets per burst.\n\n"

"set burst tx delay (microseconds) retry (num)\n"
-   "Set the transmit delay time and number of retries"
-   " in mac_retry forwarding mode.\n\n"
+   "Set the transmit delay time and number of retries,"
+   " effective when retry is enabled.\n\n"

"set txpkts (x[,y]*)\n"
"Set the length of each segment of TXONLY"
@@ -4480,6 +4480,7 @@ static void cmd_set_fwd_mode_parsed(void *parsed_result,
 {
struct cmd_set_fwd_mode_result *res = parsed_result;

+   retry_enabled = 0;
set_pkt_forwarding_mode(res->mode);
 }

@@ -4525,6 +4526,74 @@ static void cmd_set_fwd_mode_init(void)
token_struct->string_data.str = token;
 }

+/* *** SET RETRY FORWARDING MODE *** */
+struct cmd_set_fwd_retry_mode_result {
+   cmdline_fixed_string_t set;
+   cmdline_fixed_string_t fwd;
+   cmdline_fixed_string_t mode;
+   cmdline_fixed_string_t retry;
+};
+
+static void cmd_set_fwd_retry_mode_parsed(void *parsed_result,
+   __attribute__((unused)) struct cmdline *cl,
+   __attribute__((unused)) void *data)
+{
+   struct cmd_set_fwd_retry_mode_result *res = parsed_result;
+
+   retry_enabled = 1;
+   set_pkt_forwarding_mode(res->mode);
+}
+
+cmdline_parse_token_string_t cmd_setfwd_retry_set =
+   TOKEN_STRING_INITIALIZER(struct cmd_set_fwd_retry_mode_result,
+   set, "set");
+cmdline_parse_token_string_t cmd_setfwd_retry_fwd =
+   TOKEN_STRING_INITIALIZER(struct cmd_set_fwd_retry_mode_result,
+   fwd, "fwd");
+cmdline_parse_token_string_t cmd_setfwd_retry_mode =
+   TOKEN_STRING_INITIALIZER(struct cmd_set_fwd_retry_mode_result,
+   mode,
+   "" /* defined at init */);
+cmdline_parse_token_string_t cmd_setfwd_retry_retry =
+   TOKEN_STRING_INITIALIZER(struct cmd_set_fwd_retry_mode_result,
+   retry, "retry");
+
+cmdline_parse_inst_t cmd_set_fwd_retry_mode = {
+   .f = cmd_set_fwd_retry_mode_parsed,
+   .data = NULL,
+   .help_str = NULL, /* defined at init */
+   .tokens = {
+   (void *)&cmd_setfwd_retry_set,
+   (void *)&cmd_setfwd_retry_fwd,
+   (void *)&cmd_setfwd_retry_mode,
+   (void *)&cmd_setfwd_retry_retry,
+   NULL,
+   },
+};
+
+static void cmd_set_fwd_retry_mode_init(void)
+{
+   char *modes, *c;
+   static char token[128];
+   static char help[256];
+   cmdline_parse_token_string_t *token_struct;
+
+   modes = list_pkt_forwarding_retry_modes();
+   snprintf(help, sizeof(help), "set fwd %s retry - "
+   "set packet forwarding mode with retry", modes);
+   cmd_set_fwd_retry_mode.help_str = help;
+
+   /* string token

[dpdk-dev] [PATCH v2 0/5] vhost/virtio performance loopback utility

2016-06-01 Thread Zhihong Wang
This patch enables vhost/virtio pmd performance loopback test in testpmd.
All the features are for general usage.

The loopback test focuses on the maximum full-path packet forwarding
performance between host and guest, it runs vhost/virtio pmd only without
introducing extra overhead.

Therefore, the main requirement is traffic generation, since there's no
other packet generators like IXIA to help.

In current testpmd, iofwd is the best candidate to perform this loopback
test because it's the fastest possible forwarding engine: Start testpmd
iofwd in host with 1 vhost port, and start testpmd iofwd in the connected
guest with 1 corresponding virtio port, and these 2 ports form a forwarding
loop: Host vhost tx -> Guest virtio rx -> Guest virtio tx -> Host vhost rx.

As to traffic generation, "start tx_first" injects a burst of packets into
the loop.

However 2 issues remain:

   1. If only 1 burst of packets are injected in the loop, there will
  definitely be empty rx operations, e.g. When guest virtio port send
  burst to the host, then it starts the rx immediately, it's likely
  the packets are still being forwarded by host vhost port and haven't
  reached the guest yet.

  We need to fill up the ring to keep all pmds busy.

   2. iofwd doesn't provide retry mechanism, so if packet loss occurs,
  there won't be a full burst in the loop.

To address these issues, this patch:

   1. Add retry option in testpmd to prevent most packet losses.

   2. Add parameter to enable configurable tx_first burst number.

Other related improvements include:

   1. Handle all rxqs when multiqueue is enabled: Current testpmd forces a
  single core for each rxq which causes inconvenience and confusion.

  This change doesn't break anything, we can still force a single core
  for each rxq, by giving the same number of cores with the number of
  rxqs.

  One example: One Red Hat engineer was doing multiqueue test, there're
  2 ports in guest each with 4 queues, and testpmd was used as the
  forwarding engine in guest, as usual he used 1 core for forwarding, as
  a results he only saw traffic from port 0 queue 0 to port 1 queue 0,
  then a lot of emails and quite some time are spent to root cause it,
  and of course it's caused by this unreasonable testpmd behavior.

  Moreover, even if we understand this behavior, if we want to test the
  above case, we still need 8 cores for a single guest to poll all the
  rxqs, obviously this is too expensive.

  We met quite a lot cases like this.

   2. Show topology at forwarding start: "show config fwd" also does this,
  but show it directly can reduce the possibility of mis-configuration.

  Like the case above, if testpmd shows topology at forwarding start,
  then probably all those debugging efforts can be saved.

   3. Add throughput information in port statistics display for "show port
  stats (port_id|all)".

Finally there's documentation update.

Example on how to enable vhost/virtio performance loopback test:

   1. Start testpmd in host with 1 vhost port only.

   2. Start testpmd in guest with only 1 virtio port connected to the
  corresponding vhost port.

   3. "set fwd io retry" in testpmds in both host and guest.

   4. "start" in testpmd in guest.

   5. "start tx_first 16" in testpmd in host.

Then use "show port stats all" to monitor the performance.

--
Changes in v2:

   1. Add retry as an option for existing forwarding engines except rxonly.

   2. Minor code adjustment and more detailed patch description.


Zhihong Wang (5):
  testpmd: add retry option
  testpmd: configurable tx_first burst number
  testpmd: show throughput in port stats
  testpmd: handle all rxqs in rss setup
  testpmd: show topology at forwarding start

 app/test-pmd/Makefile   |   1 -
 app/test-pmd/cmdline.c  | 118 +++-
 app/test-pmd/config.c   |  79 +++---
 app/test-pmd/csumonly.c |  12 ++
 app/test-pmd/flowgen.c  |  12 ++
 app/test-pmd/icmpecho.c |  15 +++
 app/test-pmd/iofwd.c|  22 +++-
 app/test-pmd/macfwd-retry.c | 164 
 app/test-pmd/macfwd.c   |  13 +++
 app/test-pmd/macswap.c  |  12 ++
 app/test-pmd/testpmd.c  |  13 ++-
 app/test-pmd/testpmd.h  |  14 ++-
 app/test-pmd/txonly.c   |  12 ++
 doc/guides/testpmd_app_ug/run_app.rst   |   1 -
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  16 +--
 15 files changed, 303 insertions(+), 201 deletions(-)
 delete mode 100644 app/test-pmd/macfwd-retry.c

-- 
2.5.0



[dpdk-dev] [PATCH] eal: fix rte_memcpy perf in hsw/bdw

2016-05-24 Thread Zhihong Wang
This patch fixes rte_memcpy performance in Haswell and Broadwell for
vhost when copy size larger than 256 bytes.

It is observed that for large copies like 1024/1518 ones, rte_memcpy
suffers high ratio of store buffer full issue which causes pipeline
to stall in scenarios like vhost enqueue. This can be alleviated by
adjusting instruction layout. Note that this issue may not be visible
in micro test.

How to reproduce?

PHY-VM-PHY using vhost/virtio or vhost/virtio loop back, with large
packets like 1024/1518 bytes ones. Make sure packet generation rate
is not the bottleneck if PHY-VM-PHY is used.

Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 116 ++---
 1 file changed, 30 insertions(+), 86 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index f463ab3..413035e 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -363,71 +363,26 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 }

 /**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-   rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-   rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-   rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-   rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-   rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-   rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-   rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-   rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
-
-/**
- * Copy 64-byte blocks from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
-{
-   __m256i ymm0, ymm1;
-
-   while (n >= 64) {
-   ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 0 * 32));
-   n -= 64;
-   ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 1 * 32));
-   src = (const uint8_t *)src + 64;
-   _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
-   _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
-   dst = (uint8_t *)dst + 64;
-   }
-}
-
-/**
- * Copy 256-byte blocks from one location to another,
+ * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  */
 static inline void
-rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
-   __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
+   __m256i ymm0, ymm1, ymm2, ymm3;

-   while (n >= 256) {
+   while (n >= 128) {
ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 0 * 32));
-   n -= 256;
+   n -= 128;
ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 1 * 32));
ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 2 * 32));
ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 3 * 32));
-   ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 4 * 32));
-   ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 5 * 32));
-   ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 6 * 32));
-   ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
*)src + 7 * 32));
-   src = (const uint8_t *)src + 256;
+   src = (const uint8_t *)src + 128;
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
-   _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
-   _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
-   _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
-   _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
-   dst = (uint8_t *)dst + 256;
+   dst = (uint8_t *)dst + 128;
}
 }

@@ -466,51 +421,56 @@ rte_memcpy(void *dst, const void *src, size_t n)
}

/**
-* Fast way when copy size doesn't exceed 512 bytes
+* Fast way when copy size doesn't exceed 256 bytes
 */
if (n <= 32) {
rte_mov16((u

[dpdk-dev] [PATCH 6/6] testpmd: update documentation

2016-05-05 Thread Zhihong Wang
This patch updates documentation for testpmd.


Signed-off-by: Zhihong Wang 
---
 doc/guides/testpmd_app_ug/run_app.rst   |  1 +
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 10 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/doc/guides/testpmd_app_ug/run_app.rst 
b/doc/guides/testpmd_app_ug/run_app.rst
index f605564..edd3e42 100644
--- a/doc/guides/testpmd_app_ug/run_app.rst
+++ b/doc/guides/testpmd_app_ug/run_app.rst
@@ -328,6 +328,7 @@ The commandline options are:
 Set the forwarding mode where ``mode`` is one of the following::

io (the default)
+   io_retry
mac
mac_retry
mac_swap
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst 
b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index aed5e47..7703c89 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -98,9 +98,11 @@ Start packet forwarding with current configuration::
 start tx_first
 ~~

-Start packet forwarding with current configuration after sending one burst of 
packets::
+Start packet forwarding with current configuration after sending specified 
number of bursts of packets::

-   testpmd> start tx_first
+   testpmd> start tx_first (""|burst_num)
+
+The default burst number is 1 when ``burst_num`` not presented.

 stop
 
@@ -249,7 +251,7 @@ set fwd

 Set the packet forwarding mode::

-   testpmd> set fwd (io|mac|mac_retry|macswap|flowgen| \
+   testpmd> set fwd (io|io_retry|mac|mac_retry|macswap|flowgen| \
  rxonly|txonly|csum|icmpecho)

 The available information categories are:
@@ -258,6 +260,8 @@ The available information categories are:
   This is the fastest possible forwarding operation as it does not access 
packets data.
   This is the default mode.

+* ``io_retry``: Forwards packets "as-is" in I/O retry mode.
+
 * ``mac``: Changes the source and the destination Ethernet addresses of 
packets before forwarding them.

 * ``mac_retry``: Same as "mac" forwarding mode, but includes retries if the 
destination queue is full.
-- 
2.5.0



[dpdk-dev] [PATCH 5/6] testpmd: show topology at forwarding start

2016-05-05 Thread Zhihong Wang
This patch show topology at forwarding start.

"show config fwd" also does this, but showing it directly can reduce the
possibility of misconfiguration.


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/testpmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index b9c8db9..ef72a93 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1003,7 +1003,7 @@ start_packet_forwarding(int with_tx_first)
if(!no_flush_rx)
flush_fwd_rx_queues();

-   fwd_config_setup();
+   fwd_config_display();
rxtx_config_display();

for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
-- 
2.5.0



[dpdk-dev] [PATCH 4/6] testpmd: handle all rxqs in rss setup

2016-05-05 Thread Zhihong Wang
This patch removes constraints in rxq handling when multiqueue is enabled
to handle all the rxqs.

Current testpmd forces a dedicated core for each rxq, some rxqs may be
ignored when core number is less than rxq number, and that causes confusion
and inconvenience.


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/config.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index bb0b542..8ab2963 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1193,19 +1193,13 @@ rss_fwd_config_setup(void)
cur_fwd_config.nb_fwd_ports = nb_fwd_ports;
cur_fwd_config.nb_fwd_streams =
(streamid_t) (nb_q * cur_fwd_config.nb_fwd_ports);
-   if (cur_fwd_config.nb_fwd_streams > cur_fwd_config.nb_fwd_lcores)
-   cur_fwd_config.nb_fwd_streams =
-   (streamid_t)cur_fwd_config.nb_fwd_lcores;
-   else
-   cur_fwd_config.nb_fwd_lcores =
-   (lcoreid_t)cur_fwd_config.nb_fwd_streams;

/* reinitialize forwarding streams */
init_fwd_streams();

setup_fwd_config_of_each_lcore(&cur_fwd_config);
rxp = 0; rxq = 0;
-   for (lc_id = 0; lc_id < cur_fwd_config.nb_fwd_lcores; lc_id++) {
+   for (lc_id = 0; lc_id < cur_fwd_config.nb_fwd_streams; lc_id++) {
struct fwd_stream *fs;

fs = fwd_streams[lc_id];
-- 
2.5.0



[dpdk-dev] [PATCH 3/6] testpmd: show throughput in port stats

2016-05-05 Thread Zhihong Wang
This patch adds throughput numbers (in the period since last use of this
command) in port statistics display for "show port stats (port_id|all)".


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/config.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 1c552e4..bb0b542 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -92,6 +92,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "testpmd.h"

@@ -150,6 +151,10 @@ print_ethaddr(const char *name, struct ether_addr 
*eth_addr)
 void
 nic_stats_display(portid_t port_id)
 {
+   static uint64_t sum_rx[RTE_MAX_ETHPORTS];
+   static uint64_t sum_tx[RTE_MAX_ETHPORTS];
+   static uint64_t cycles[RTE_MAX_ETHPORTS];
+   uint64_t pkt_rx, pkt_tx, cycle;
struct rte_eth_stats stats;
struct rte_port *port = &ports[port_id];
uint8_t i;
@@ -209,6 +214,21 @@ nic_stats_display(portid_t port_id)
}
}

+   cycle = cycles[port_id];
+   cycles[port_id] = rte_rdtsc();
+   if (cycle > 0)
+   cycle = cycles[port_id] - cycle;
+
+   pkt_rx = stats.ipackets - sum_rx[port_id];
+   pkt_tx = stats.opackets - sum_tx[port_id];
+   sum_rx[port_id] = stats.ipackets;
+   sum_tx[port_id] = stats.opackets;
+   printf("\n  Throughput (since last show)\n");
+   printf("  RX-pps: %12"PRIu64"\n"
+   "  TX-pps: %12"PRIu64"\n",
+   cycle > 0 ? pkt_rx * rte_get_tsc_hz() / cycle : 0,
+   cycle > 0 ? pkt_tx * rte_get_tsc_hz() / cycle : 0);
+
printf("  %s%s\n",
   nic_stats_border, nic_stats_border);
 }
-- 
2.5.0



[dpdk-dev] [PATCH 2/6] testpmd: configurable tx_first burst number

2016-05-05 Thread Zhihong Wang
This patch enables configurable tx_first burst number.

Use "start tx_first (burst_num)" to specify how many bursts of packets to
be sent before forwarding start, or "start tx_first" like before for the
default 1 burst send.


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/cmdline.c | 41 +
 app/test-pmd/testpmd.c |  7 +--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index c5b9479..8f78cc6 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -5240,6 +5240,46 @@ cmdline_parse_inst_t cmd_start_tx_first = {
},
 };

+/* *** START FORWARDING WITH N TX BURST FIRST *** */
+struct cmd_start_tx_first_n_result {
+   cmdline_fixed_string_t start;
+   cmdline_fixed_string_t tx_first;
+   uint32_t tx_num;
+};
+
+static void
+cmd_start_tx_first_n_parsed(__attribute__((unused)) void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+   struct cmd_start_tx_first_n_result *res = parsed_result;
+
+   start_packet_forwarding(res->tx_num);
+}
+
+cmdline_parse_token_string_t cmd_start_tx_first_n_start =
+   TOKEN_STRING_INITIALIZER(struct cmd_start_tx_first_n_result,
+   start, "start");
+cmdline_parse_token_string_t cmd_start_tx_first_n_tx_first =
+   TOKEN_STRING_INITIALIZER(struct cmd_start_tx_first_n_result,
+   tx_first, "tx_first");
+cmdline_parse_token_num_t cmd_start_tx_first_n_tx_num =
+   TOKEN_NUM_INITIALIZER(struct cmd_start_tx_first_n_result,
+   tx_num, UINT32);
+
+cmdline_parse_inst_t cmd_start_tx_first_n = {
+   .f = cmd_start_tx_first_n_parsed,
+   .data = NULL,
+   .help_str = "start packet forwarding, after sending <num> "
+   "bursts of packets",
+   .tokens = {
+   (void *)_start_tx_first_n_start,
+   (void *)_start_tx_first_n_tx_first,
+   (void *)_start_tx_first_n_tx_num,
+   NULL,
+   },
+};
+
 /* *** SET LINK UP *** */
 struct cmd_set_link_up_result {
cmdline_fixed_string_t set;
@@ -10399,6 +10439,7 @@ cmdline_parse_ctx_t main_ctx[] = {
(cmdline_parse_inst_t *)_showcfg,
(cmdline_parse_inst_t *)_start,
(cmdline_parse_inst_t *)_start_tx_first,
+   (cmdline_parse_inst_t *)_start_tx_first_n,
(cmdline_parse_inst_t *)_set_link_up,
(cmdline_parse_inst_t *)_set_link_down,
(cmdline_parse_inst_t *)_reset,
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 61abcf8..b9c8db9 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1037,8 +1037,11 @@ start_packet_forwarding(int with_tx_first)
for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++)
(*port_fwd_begin)(fwd_ports_ids[i]);
}
-   launch_packet_forwarding(run_one_txonly_burst_on_core);
-   rte_eal_mp_wait_lcore();
+   while (with_tx_first--) {
+   launch_packet_forwarding(
+   run_one_txonly_burst_on_core);
+   rte_eal_mp_wait_lcore();
+   }
port_fwd_end = tx_only_engine.port_fwd_end;
if (port_fwd_end != NULL) {
for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++)
-- 
2.5.0



[dpdk-dev] [PATCH 1/6] testpmd: add io_retry forwarding

2016-05-05 Thread Zhihong Wang
This patch adds an io_retry-fwd in testpmd to prevent most packet
losses. It can be enabled by "set fwd io_retry".

io-fwd is the fastest possible forwarding engine, good for basic
performance test. Adding retry mechanism expands test case coverage
to support scenarios where packet loss affects test results.


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/Makefile  |   1 +
 app/test-pmd/iofwd-retry.c | 139 +
 app/test-pmd/testpmd.c |   1 +
 app/test-pmd/testpmd.h |   1 +
 4 files changed, 142 insertions(+)
 create mode 100644 app/test-pmd/iofwd-retry.c

diff --git a/app/test-pmd/Makefile b/app/test-pmd/Makefile
index 72426f3..a6735cf 100644
--- a/app/test-pmd/Makefile
+++ b/app/test-pmd/Makefile
@@ -49,6 +49,7 @@ SRCS-y += parameters.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline.c
 SRCS-y += config.c
 SRCS-y += iofwd.c
+SRCS-y += iofwd-retry.c
 SRCS-y += macfwd.c
 SRCS-y += macfwd-retry.c
 SRCS-y += macswap.c
diff --git a/app/test-pmd/iofwd-retry.c b/app/test-pmd/iofwd-retry.c
new file mode 100644
index 000..14c5660
--- /dev/null
+++ b/app/test-pmd/iofwd-retry.c
@@ -0,0 +1,139 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "testpmd.h"
+
+#define TX_RETRY_TIMES 128
+#define TX_RETRY_WAIT_US 1
+
+/*
+ * I/O retry forwarding mode.
+ * Forward packets "as-is" without access to packet data.
+ */
+static void
+pkt_burst_io_retry_forward(struct fwd_stream *fs)
+{
+   struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
+   uint16_t nb_rx;
+   uint16_t nb_tx;
+   uint32_t retry;
+
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+   uint64_t start_tsc;
+   uint64_t end_tsc;
+   uint64_t core_cycles;
+#endif
+
+#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
+   start_tsc = rte_rdtsc();
+#endif
+
+   nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
+   pkts_burst, nb_pkt_per_burst);
+   if (unlikely(nb_rx == 0))
+   return;
+   fs->rx_packets += nb_rx;
+
+#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
+   fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
+#endif
+
+   retry = 0;
+   nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+   pkts_burst, nb_rx);
+   while (unlikely(nb_tx < nb_rx) && (retry++ < TX_RETRY_TIMES)) {
+   rte_delay_us(TX_RETRY_WAIT_US);
+   nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
+   &pkts_burst[nb_tx], nb_rx - nb_tx);
+   }
+   fs->tx_packets += nb_tx;
+
+#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
+   fs->tx_burst_stats.pkt_burst_spread[nb_tx]++;
+#endif
+
+   if (unlikely(nb_tx < nb_rx)) {
+   fs->fwd_dropped += (nb_rx - nb_tx);
+   do {
+   rte_pktmbuf_free(pkts_burst[nb_tx]);
+   } while (++nb_tx < nb_rx);
+   }

[dpdk-dev] [PATCH 0/6] vhost/virtio performance loopback utility

2016-05-05 Thread Zhihong Wang
This patch enables vhost/virtio pmd performance loopback test in testpmd.
All the features are for general usage.

The loopback test focuses on the maximum full-path packet forwarding
performance between host and guest, it runs vhost/virtio pmd only without
introducing extra overhead.

Therefore, the main requirement is traffic generation, since there's no
other packet generators like IXIA to help.

In current testpmd, io-fwd is the ideal candidate to perform this loopback
test because it's the fastest possible forwarding engine: Start testpmd
io-fwd in host with 1 vhost pmd port, and start testpmd io-fwd in the
connected guest with 1 corresponding virtio pmd port, and these 2 ports
form a forwarding loop, packets received by the host vhost pmd port are
forwarded to the guest virtio pmd port, and packets received by the guest
virtio pmd port are sent to the host vhost pmd port.

As to traffic generation, "start tx_first" injects a burst of packets into
the loop, which is the ideal way to do that.

However 2 issues remain:

   1. If only 1 burst of packets is injected in the loop, there will
  almost definitely be empty rx operations, e.g. When guest virtio pmd
  port send burst to the host, then it starts the rx immediately, it's
  likely the packets are still being forwarded by host vhost pmd port
  and haven't reached the guest yet.

  We need to fill up the ring to keep all pmds busy.

   2. io-fwd doesn't provide retry mechanism, so if packet loss occurs,
  there won't be a full burst in the loop.

To address these issues, this patch:

   1. Add an io_retry-fwd in testpmd to prevent most packet losses.

   2. Add parameter to enable configurable tx_first burst number.

Other related improvements include:

   1. Handle all rxqs when multiqueue is enabled: Current testpmd forces a
  single core for each rxq which causes inconvenience and confusion.

   2. Show topology at forwarding start: "show config fwd" also does this,
  but showing it directly can reduce the possibility of mis-configuration.

   3. Add throughput information in port statistics display for "show port
  stats (port_id|all)".

Finally there's documentation update.

Example on how to enable vhost/virtio performance loopback test:

   1. Start testpmd in host with 1 vhost pmd port only.

   2. Start testpmd in guest with only 1 virtio pmd port connected to the
  corresponding vhost pmd port.

   3. "set fwd io_retry" in testpmds in both host and guest.

   4. "start" in testpmd in guest.

   5. "start tx_first 8" in testpmd in host.

Then use "show port stats all" to monitor the performance.


Zhihong Wang (6):
  testpmd: add io_retry forwarding
  testpmd: configurable tx_first burst number
  testpmd: show throughput in port stats
  testpmd: handle all rxqs in rss setup
  testpmd: show topology at forwarding start
  testpmd: update documentation

 app/test-pmd/Makefile   |   1 +
 app/test-pmd/cmdline.c  |  41 
 app/test-pmd/config.c   |  28 --
 app/test-pmd/iofwd-retry.c  | 139 
 app/test-pmd/testpmd.c  |  10 +-
 app/test-pmd/testpmd.h  |   1 +
 doc/guides/testpmd_app_ug/run_app.rst   |   1 +
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  10 +-
 8 files changed, 218 insertions(+), 13 deletions(-)
 create mode 100644 app/test-pmd/iofwd-retry.c

-- 
2.5.0



[dpdk-dev] [PATCH] doc: virtio pmd versions

2016-04-21 Thread Zhihong Wang
This patch explains all the versions of current virtio pmd implementation,
what's the difference, and how to choose the right version.

Signed-off-by: Zhihong Wang 
---
 doc/guides/nics/virtio.rst | 57 ++
 1 file changed, 57 insertions(+)

diff --git a/doc/guides/nics/virtio.rst b/doc/guides/nics/virtio.rst
index 06ca433..43ba686 100644
--- a/doc/guides/nics/virtio.rst
+++ b/doc/guides/nics/virtio.rst
@@ -211,3 +211,60 @@ In this example, the packet reception flow path is:
 The packet transmission flow is:

 IXIA packet generator-> Guest VM 82599 VF port1 rx burst-> Guest VM virtio 
port 0 tx burst-> tap -> Linux Bridge->82599 PF-> IXIA packet generator
+
+Virtio PMD Versions
+-------------------
+
+Virtio driver has 3 versions of rx functions and 2 versions of tx functions.
+
+RX functions:
+
+*   ``virtio_recv_pkts``:
+
+Regular version without mergeable rx buffers support
+
+*   ``virtio_recv_mergeable_pkts``:
+
+Regular version with mergeable rx buffers support
+
+*   ``virtio_recv_pkts_vec``:
+
+Simple version without mergeable rx buffers support, also fixes the avail 
ring and uses vector instructions to optimize performance
+
+TX functions:
+
+*   ``virtio_xmit_pkts``:
+
+Regular version
+
+*   ``virtio_xmit_pkts_simple``:
+
+Simple version fixes the avail ring to optimize performance
+
+By default, the non-vector versions are used:
+
+*   For rx: If mergeable rx buffers is disabled then ``virtio_recv_pkts`` is 
used; otherwise ``virtio_recv_mergeable_pkts``
+
+*   For tx: ``virtio_xmit_pkts``
+
+Setting ``txq_flags`` to ``VIRTIO_SIMPLE_FLAGS`` (0xf01) enables the simple 
version of virtio poll mode driver:
+
+*   For rx: ``virtio_recv_pkts_vec``
+
+*   For tx: ``virtio_xmit_pkts_simple``
+
+The simple version will only be enabled when:
+
+*   Mergeable rx buffers is disabled
+
+*   Single segment is specified
+
+*   No offload support is needed
+
+Example to use the simple version of virtio poll mode driver in testpmd:
+
+.. code-block:: console
+
+./x86_64-native-linuxapp-gcc/app/testpmd -c 0x7 -n 4
+--  -i --txqflags=0xf01 --rxq=1 --txq=1 --nb-cores=1
+
-- 
2.5.0



[dpdk-dev] [RFC PATCH 2/2] testpmd: add portfwd commands

2016-04-20 Thread Zhihong Wang
This patch adds command support for portfwd, to enable run time
configuration.

Command details:

   1) set fwd port

  switch forwarding engine to portfwd

   2) show route

  show port info and forwarding rules for portfwd

   3) set route  

  packets from  will be dispatched to 

   4) set route  ip

  packets from  will be dispatched based on dst ip

   5) set ip 

  set ip addr for , portfwd will use this ip addr to do ip
  route

   6) set affinity  

  forwarding stream  will be handled by core 
  (info can be read from "show route")

   7) show port perf all

  show perf stats (rx/tx cycles, burst size distribution, tx pktloss)
  of each port

   8) set drain 

  set drain interval to drain buffered packets which is not sent
  because buffer not full (0 to disable)

Below are 3 examples to show how to use portfwd to build traffic flow in
the host (Guest traffic can be built likewise):

   1) PVP test: NIC-VM-NIC
  *  2 VMs each with 2 vhost ports: port 0, 1 and 2, 3
  *  1 NIC with 2 ports: port 4, 5
  *  Traffic from 4 goes to 0 and 2, and back from 1, 3 to 5
  Commands:
 set fwd port
 set ip 0 192 168 1 1
 set ip 2 192 168 1 2
 set route 4 ip (Make sure traffic has the right dst ip)
 set route 1 5
 set route 3 5
 set drain 0
 show route

   2) PVVP test: NIC-VM-VM-NIC
  *  2 VMs each with 2 vhost ports: port 0, 1 and 2, 3
  *  1 NIC with 2 ports: port 4, 5
  *  Traffic from 4 goes to 0, and 1 to 2, finally 3 to 5
  Commands:
 set fwd port
 set route 4 0
 set route 1 2
 set route 3 5
 set drain 0
 show route

   3) PVP bi-directional test: NIC-VM-NIC
  *  1 VM with 2 vhost ports: port 0, 1
  *  1 NIC with 2 ports: port 2, 3
  *  Traffic from 0 to 2, 1 to 3, and 2 to 0, 3 to 1
  Commands:
 set fwd port
 set route 0 2
 set route 2 0
 set route 1 3
 set route 3 1
 set drain 0
 show route


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/cmdline.c | 279 -
 1 file changed, 277 insertions(+), 2 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index c5b9479..6a076a4 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -187,6 +187,9 @@ static void cmd_help_long_parsed(void *parsed_result,
"show port (info|stats|xstats|fdir|stat_qmap|dcb_tc) 
(port_id|all)\n"
"Display information for port_id, or all.\n\n"

+   "show port perf all\n"
+   "Display performance information for all.\n\n"
+
"show port X rss reta (size) (mask0,mask1,...)\n"
"Display the rss redirection table entry indicated"
" by masks on port X. size is used to indicate the"
@@ -5401,6 +5404,9 @@ static void cmd_showportall_parsed(void *parsed_result,
else if (!strcmp(res->what, "dcb_tc"))
FOREACH_PORT(i, ports)
port_dcb_info_display(i);
+   else if (!strcmp(res->what, "perf"))
+   if (cur_fwd_eng == &port_fwd_engine)
+   print_perf_stats();
 }

 cmdline_parse_token_string_t cmd_showportall_show =
@@ -5410,13 +5416,14 @@ cmdline_parse_token_string_t cmd_showportall_port =
TOKEN_STRING_INITIALIZER(struct cmd_showportall_result, port, "port");
 cmdline_parse_token_string_t cmd_showportall_what =
TOKEN_STRING_INITIALIZER(struct cmd_showportall_result, what,
-"info#stats#xstats#fdir#stat_qmap#dcb_tc");
+
"info#stats#xstats#fdir#stat_qmap#dcb_tc#perf");
 cmdline_parse_token_string_t cmd_showportall_all =
TOKEN_STRING_INITIALIZER(struct cmd_showportall_result, all, "all");
 cmdline_parse_inst_t cmd_showportall = {
.f = cmd_showportall_parsed,
.data = NULL,
-   .help_str = "show|clear port info|stats|xstats|fdir|stat_qmap|dcb_tc 
all",
+   .help_str = "show|clear port info|stats|xstats|fdir|stat_qmap|"
+   "dcb_tc|perf all",
.tokens = {
(void *)_showportall_show,
(void *)_showportall_port,
@@ -9725,6 +9732,268 @@ cmdline_parse_inst_t cmd_mcast_addr = {
},
 };

+/* *** SHOW ROUTE *** */
+struct cmd_show_route_result {
+   cmdline_fixed_string_t show;
+   cmdline_fixed_string_t route;
+};
+
+static void cmd_show_route_parsed(
+   __attribute__((unused)) void *parsed_result,
+   __attribute__((unused)) struct cmdline *cl,
+   __attribute__((unused)) void *data)
+{

[dpdk-dev] [RFC PATCH 1/2] testpmd: add portfwd engine

2016-04-20 Thread Zhihong Wang
This patch implements a general purpose forwarding engine in testpmd namely
"portfwd", to enable performance analysis and tuning for poll mode drivers
in vSwitching scenarios.

Features of portfwd:

   1) Build up traffic from simple rx/tx to complex scenarios easily

   2) Rich performance statistics for all ports

   3) Core affinity manipulation

   4) Commands for run time configuration

To enable flexible traffic flow setup, each port has 2 ways to forward
packets in portfwd:

   1) Forward based on dst ip

  For ip based forwarding, portfwd scans each packet to get the dst ip
  for dst port mapping.

  A simple suffix mapping method is used for dst ip based forwarding, a
  macro IPV4_ROUTE_MASK is used to specify how many (last) bits of dst
  ip will be used for hashing.

  It is recommended to make sure there's no conflict by setting proper
  IPV4_ROUTE_MASK and/or different ip ends for each port, otherwise it
  may hurt performance.

   2) Forward to a fixed port

  For fixed port forwarding, portfwd still scans each packet on purpose
  to simulate the impact of packet analysis behavior in real scenarios.

After dst ports are identified, packets are enqueued to a buffer which will
be burst sent when full. Packet buffers are built at each src port, so no
contention at enqueue stage.

There is a timeout interval to drain all buffers, which can be configured
or disabled.

Spinlock is used at dst port & queue to solve conflicts.

Notice that portfwd has fair performance, but it's not for getting the
"maximum" numbers:

   1) It buffers packets for burst send efficiency analysis, which increase
   latency

   2) It touches the packet header and collect performance statistics which
   adds overheads

These "extra" overheads are actually what happens in real applications.

Modifications are:
   1) Add the portfwd engine in portfwd.c
   2) Add related data structures
   3) Add support functions


Signed-off-by: Zhihong Wang 
---
 app/test-pmd/Makefile  |   1 +
 app/test-pmd/config.c  | 408 ++-
 app/test-pmd/portfwd.c | 418 +
 app/test-pmd/testpmd.c |  19 +++
 app/test-pmd/testpmd.h |  62 
 5 files changed, 900 insertions(+), 8 deletions(-)
 create mode 100644 app/test-pmd/portfwd.c

diff --git a/app/test-pmd/Makefile b/app/test-pmd/Makefile
index 72426f3..0352feb 100644
--- a/app/test-pmd/Makefile
+++ b/app/test-pmd/Makefile
@@ -49,6 +49,7 @@ SRCS-y += parameters.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline.c
 SRCS-y += config.c
 SRCS-y += iofwd.c
+SRCS-y += portfwd.c
 SRCS-y += macfwd.c
 SRCS-y += macfwd-retry.c
 SRCS-y += macswap.c
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index b1bbec6..9754229 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -92,6 +92,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 

 #include "testpmd.h"

@@ -150,6 +152,11 @@ print_ethaddr(const char *name, struct ether_addr 
*eth_addr)
 void
 nic_stats_display(portid_t port_id)
 {
+   static uint64_t cnt_rx[RTE_MAX_ETHPORTS];
+   static uint64_t cnt_tx[RTE_MAX_ETHPORTS];
+   static uint64_t cycle[RTE_MAX_ETHPORTS];
+   uint64_t crx, ctx, c;
+
struct rte_eth_stats stats;
struct rte_port *port = &ports[port_id];
uint8_t i;
@@ -209,6 +216,20 @@ nic_stats_display(portid_t port_id)
}
}

+   c = cycle[port_id];
+   cycle[port_id] = rte_rdtsc();
+   if (c > 0)
+   c = cycle[port_id] - c;
+
+   crx = stats.ipackets - cnt_rx[port_id];
+   ctx = stats.opackets - cnt_tx[port_id];
+   cnt_rx[port_id] = stats.ipackets;
+   cnt_tx[port_id] = stats.opackets;
+   printf("  Throughput (since last show):\n");
+   printf("  RX PPS: %12"PRIu64"\n  TX PPS: %12"PRIu64"\n",
+   c > 0 ? crx * rte_get_tsc_hz() / c : 0,
+   c > 0 ? ctx * rte_get_tsc_hz() / c : 0);
+
printf("  %s%s\n",
   nic_stats_border, nic_stats_border);
 }
@@ -1087,6 +1108,178 @@ setup_fwd_config_of_each_lcore(struct fwd_config *cfg)
 }

 static void
+copy_fwd_stream(struct fwd_stream *src, struct fwd_stream *dst)
+{
+   rte_memcpy(dst, src, sizeof(struct fwd_stream));
+}
+
+int
+set_fwd_stream_affinity(unsigned int idx, unsigned int core)
+{
+   struct fwd_stream **fwd_streams_tmp;
+   struct fwd_stream *fs;
+   unsigned int lc_id_dst;
+   unsigned int lc_id_src;
+   unsigned int fs_id;
+   unsigned int i, j, ci, cj;
+
+   if (cur_fwd_eng != &port_fwd_engine)
+   return 0;
+   if (test_done == 0) {
+   printf("please stop forwarding first\n");
+   return 0;
+   }
+   for (i = 0; i < cur_fwd_config.nb_fwd_lcores; i++) {
+   

[dpdk-dev] [RFC PATCH 0/2] performance utility in testpmd

2016-04-20 Thread Zhihong Wang
 drain 0
 show route

For the PVP bi-directional test, host testpmd can be launched like:

./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf0 -n 4 --socket-mem 4096,0
   --vdev 'eth_vhost0,iface=/tmp/sock0,queues=2'
   --vdev 'eth_vhost1,iface=/tmp/sock1,queues=2'
   -- -i --rxq=2 --txq=2 --rss-ip --nb-cores=2


Zhihong Wang (2):
  testpmd: add portfwd engine
  testpmd: add portfwd commands

 app/test-pmd/Makefile  |   1 +
 app/test-pmd/cmdline.c | 279 -
 app/test-pmd/config.c  | 408 ++-
 app/test-pmd/portfwd.c | 418 +
 app/test-pmd/testpmd.c |  19 +++
 app/test-pmd/testpmd.h |  62 
 6 files changed, 1177 insertions(+), 10 deletions(-)
 create mode 100644 app/test-pmd/portfwd.c

-- 
2.5.0



[dpdk-dev] [PATCH] eal/x86: Fix build with clang for old AVX

2016-02-03 Thread Zhihong Wang
When configuring RTE_MACHINE to "default", rte_memcpy implementation
is the default one (old AVX).
In this code, clang raises a warning thanks to -Wsometimes-uninitialized:

rte_memcpy.h:838:6: error:
variable 'srcofs' is used uninitialized whenever 'if' condition is false
if (dstofss > 0) {
^~~
rte_memcpy.h:849:6: note: uninitialized use occurs here
if (srcofs == 0) {
^~

It is fixed by moving srcofs initialization out of the condition.
Also dstofss calculation is corrected.

Fixes: 1ae817f9f887 ("eal/x86: tune memcpy for platforms without AVX512")

Signed-off-by: Zhihong Wang 
Reported-by: De Lara Guarch, Pablo 
---
 lib/librte_eal/common/include/arch/x86/rte_memcpy.h | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index 8e2c53c..f463ab3 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -512,8 +512,9 @@ COPY_BLOCK_64_BACK31:
/**
 * Make store aligned when copy size exceeds 512 bytes
 */
-   dstofss = 32 - ((uintptr_t)dst & 0x1F);
+   dstofss = (uintptr_t)dst & 0x1F;
if (dstofss > 0) {
+   dstofss = 32 - dstofss;
n -= dstofss;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
@@ -834,14 +835,15 @@ COPY_BLOCK_64_BACK15:
 * unaligned copy functions require up to 15 bytes
 * backwards access.
 */
-   dstofss = 16 - ((uintptr_t)dst & 0x0F) + 16;
+   dstofss = (uintptr_t)dst & 0x0F;
if (dstofss > 0) {
+   dstofss = 16 - dstofss + 16;
n -= dstofss;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
-   srcofs = ((uintptr_t)src & 0x0F);
}
+   srcofs = ((uintptr_t)src & 0x0F);

/**
 * For aligned copy
-- 
2.5.0



[dpdk-dev] [dpdk-dev,v2] Clean up rte_memcpy.h file

2016-01-27 Thread Zhihong Wang
> Remove unnecessary type casting in functions.
> 
> Tested on Ubuntu (14.04 x86_64) with "make test".
> "make test" results match the results with baseline.
> "Memcpy perf" results match the results with baseline.
> 
> Signed-off-by: Ravi Kerur 
> Acked-by: Stephen Hemminger 
> 
> ---
> .../common/include/arch/x86/rte_memcpy.h   | 340 +++--
>  1 file changed, 175 insertions(+), 165 deletions(-)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
> b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> index 6a57426..839d4ec 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h

[...]

>  /**
> @@ -150,13 +150,16 @@ rte_mov64blocks(uint8_t *dst, const uint8_t *src, 
> size_t n)
>   __m256i ymm0, ymm1;
>  
>   while (n >= 64) {
> - ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
> *)src + 0 * 32));
> +
> + ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32));
> + ymm1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32));
> +
> + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), ymm0);
> + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), ymm1);
> +

Any particular reason to change the order of the statements here? :)
Overall this patch looks good.

>   n -= 64;
> - ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
> *)src + 1 * 32));
> - src = (const uint8_t *)src + 64;
> - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
> - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
> - dst = (uint8_t *)dst + 64;
> + src = src + 64;
> + dst = dst + 64;
>   }
>  }
>  



[dpdk-dev] [dpdk-dev, v3] Implement memcmp using Intel SIMD instrinsics.

2016-01-27 Thread Zhihong Wang
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h b/lib
> /librte_eal/common/include/arch/x86/rte_memcmp.h

[...]

> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Compare bytes between two locations. The locations must not overlap.
> + *

Parameter names should be kept consistent as they are in function body.

> + * @param src_1
> + *   Pointer to the first source of the data.
> + * @param src_2
> + *   Pointer to the second source of the data.
> + * @param n
> + *   Number of bytes to compare.
> + * @return
> + *   zero if src_1 equal src_2
> + *   -ve if src_1 less than src_2
> + *   +ve if src_1 greater than src_2
> + */
> +static inline int
> +rte_memcmp(const void *src_1, const void *src,
> + size_t n) __attribute__((always_inline));
> +
> +/**
> + * Find the first different bit for comparison.
> + */
> +static inline int
> +rte_cmpffd (uint32_t x, uint32_t y)
> +{
> + int i;
> + int pos = x ^ y;
> + for (i = 0; i < 32; i++)
> + if (pos & (1 << i))
> + return i;
> + return -1;
> +}
> +

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Guess this is not used.

[...]

> +/**
> + * Compare 256 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp256(const void *src_1, const void *src_2)
> +{
> + int ret;
> +
> + ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
> + (const uint8_t *)src_2 + 0 * 64);

Why not just use rte_cmp128?


[...]

> +static inline int
> +rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
> +{
> + const uint8_t *src_1 = (const uint8_t *)_src_1;
> + const uint8_t *src_2 = (const uint8_t *)_src_2;
> + int ret = 0;
> +
> + if (n < 16)
> + return rte_memcmp_regular(src_1, src_2, n);
> +
> + if (n <= 32) {
> + ret = rte_cmp16(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +

Too many conditions here may harm the overall performance.
It's a trade-off thing, all about balancing the overhead.
Just make sure this is tuned based on actual test numbers.


> + if (n <= 48) {
> + ret = rte_cmp32(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +
> + if (n <= 64) {
> + ret = rte_cmp32(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16(src_1 + 32, src_2 + 32);
> +
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +
> + if (n <= 96) {
> + ret = rte_cmp64(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16(src_1 + 64, src_2 + 64);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +
> + if (n <= 128) {
> + ret = rte_cmp64(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp32(src_1 + 64, src_2 + 64);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16(src_1 + 96, src_2 + 96);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Not used.

> +{
> + int ret;
> +
> + ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> + (const uint8_t *)src_2 + 0 * 16);
> +
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
> + (const uint8_t *)src_2 + 1 * 16);
> +
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16((const uint8_t *)src_1 + 2 * 16,
> + (const uint8_t *)src_2 + 2 * 16);
> +}
> +
> +/**
> + * Compare 64 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp64(const void *src_1, const void *src_2)
> +{
> + int ret;
> +
> + ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> + (const uint8_t *)src_2 + 0 * 16);

Why not rte_cmp32? And use rte_cmp64 for rte_cmp128, and so 

[dpdk-dev] [PATCH v2 5/5] lib/librte_eal: Tune memcpy for prior platforms

2016-01-17 Thread Zhihong Wang
For prior platforms, add condition for unalignment handling, to keep this
operation from interrupting the batch copy loop for aligned cases.

Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index fee954a..d965957 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -513,10 +513,12 @@ COPY_BLOCK_64_BACK31:
 * Make store aligned when copy size exceeds 512 bytes
 */
dstofss = 32 - ((uintptr_t)dst & 0x1F);
-   n -= dstofss;
-   rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-   src = (const uint8_t *)src + dstofss;
-   dst = (uint8_t *)dst + dstofss;
+   if (dstofss > 0) {
+   n -= dstofss;
+   rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+   src = (const uint8_t *)src + dstofss;
+   dst = (uint8_t *)dst + dstofss;
+   }

/**
 * Copy 256-byte blocks.
@@ -833,11 +835,13 @@ COPY_BLOCK_64_BACK15:
 * backwards access.
 */
dstofss = 16 - ((uintptr_t)dst & 0x0F) + 16;
-   n -= dstofss;
-   rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-   src = (const uint8_t *)src + dstofss;
-   dst = (uint8_t *)dst + dstofss;
-   srcofs = ((uintptr_t)src & 0x0F);
+   if (dstofss > 0) {
+   n -= dstofss;
+   rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+   src = (const uint8_t *)src + dstofss;
+   dst = (uint8_t *)dst + dstofss;
+   srcofs = ((uintptr_t)src & 0x0F);
+   }

/**
 * For aligned copy
-- 
2.5.0



[dpdk-dev] [PATCH v2 4/5] app/test: Adjust alignment unit for memcpy perf test

2016-01-17 Thread Zhihong Wang
Decide alignment unit for memcpy perf test based on predefined macros.

Signed-off-by: Zhihong Wang 
---
 app/test/test_memcpy_perf.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/app/test/test_memcpy_perf.c b/app/test/test_memcpy_perf.c
index 754828e..73babec 100644
--- a/app/test/test_memcpy_perf.c
+++ b/app/test/test_memcpy_perf.c
@@ -79,7 +79,13 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE 100

 /* Data is aligned on this many bytes (power of 2) */
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+#define ALIGNMENT_UNIT  64
+#elif RTE_MACHINE_CPUFLAG_AVX2
 #define ALIGNMENT_UNIT  32
+#else /* RTE_MACHINE_CPUFLAG */
+#define ALIGNMENT_UNIT  16
+#endif /* RTE_MACHINE_CPUFLAG */

 /*
  * Pointers used in performance tests. The two large buffers are for uncached
-- 
2.5.0



[dpdk-dev] [PATCH v2 3/5] lib/librte_eal: Optimize memcpy for AVX512 platforms

2016-01-17 Thread Zhihong Wang
Implement AVX512 memcpy and choose the right implementation based on
predefined macros, to make full utilization of hardware resources and
deliver high performance.

In current DPDK, memcpy holds a large proportion of execution time in
libs like Vhost, especially for large packets, and this patch can bring
considerable benefits for AVX512 platforms.

The implementation is based on the current DPDK memcpy framework, some
background introduction can be found in these threads:
http://dpdk.org/ml/archives/dev/2014-November/008158.html
http://dpdk.org/ml/archives/dev/2015-January/011800.html

Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 247 -
 1 file changed, 243 insertions(+), 4 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index 6a57426..fee954a 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -37,7 +37,7 @@
 /**
  * @file
  *
- * Functions for SSE/AVX/AVX2 implementation of memcpy().
+ * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */

 #include 
@@ -67,7 +67,246 @@ extern "C" {
 static inline void *
 rte_memcpy(void *dst, const void *src, size_t n) 
__attribute__((always_inline));

-#ifdef RTE_MACHINE_CPUFLAG_AVX2
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+
+/**
+ * AVX512 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+   __m128i xmm0;
+
+   xmm0 = _mm_loadu_si128((const __m128i *)src);
+   _mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+   __m256i ymm0;
+
+   ymm0 = _mm256_loadu_si256((const __m256i *)src);
+   _mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+   __m512i zmm0;
+
+   zmm0 = _mm512_loadu_si512((const void *)src);
+   _mm512_storeu_si512((void *)dst, zmm0);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+   rte_mov64(dst + 0 * 64, src + 0 * 64);
+   rte_mov64(dst + 1 * 64, src + 1 * 64);
+}
+
+/**
+ * Copy 256 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+   rte_mov64(dst + 0 * 64, src + 0 * 64);
+   rte_mov64(dst + 1 * 64, src + 1 * 64);
+   rte_mov64(dst + 2 * 64, src + 2 * 64);
+   rte_mov64(dst + 3 * 64, src + 3 * 64);
+}
+
+/**
+ * Copy 128-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+   __m512i zmm0, zmm1;
+
+   while (n >= 128) {
+   zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+   n -= 128;
+   zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+   src = src + 128;
+   _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+   _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+   dst = dst + 128;
+   }
+}
+
+/**
+ * Copy 512-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+   __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+
+   while (n >= 512) {
+   zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+   n -= 512;
+   zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+   zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
+   zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
+   zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
+   zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
+   zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
+   zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
+   src = src + 512;
+   _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+   _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+   _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
+   _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
+   _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
+   _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
+   _mm512_storeu_si512((void *)(dst

[dpdk-dev] [PATCH v2 2/5] mk: Predefine AVX512 macro for compiler

2016-01-17 Thread Zhihong Wang
Predefine AVX512 macro if AVX512 is enabled by compiler.

Signed-off-by: Zhihong Wang 
---
 mk/rte.cpuflags.mk | 4 
 1 file changed, 4 insertions(+)

diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 28f203b..19a3e7e 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -89,6 +89,10 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__AVX2__),)
 CPUFLAGS += AVX2
 endif

+ifneq ($(filter $(AUTO_CPUFLAGS),__AVX512F__),)
+CPUFLAGS += AVX512F
+endif
+
 # IBM Power CPU flags
 ifneq ($(filter $(AUTO_CPUFLAGS),__PPC64__),)
 CPUFLAGS += PPC64
-- 
2.5.0



[dpdk-dev] [PATCH v2 1/5] lib/librte_eal: Identify AVX512 CPU flag

2016-01-17 Thread Zhihong Wang
Read CPUID to check if AVX512 is supported by CPU.

Signed-off-by: Zhihong Wang 
---
 lib/librte_eal/common/include/arch/x86/rte_cpuflags.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h 
b/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h
index dd56553..89c0d9d 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h
@@ -131,6 +131,7 @@ enum rte_cpu_flag_t {
RTE_CPUFLAG_ERMS,   /**< ERMS */
RTE_CPUFLAG_INVPCID,/**< INVPCID */
RTE_CPUFLAG_RTM,/**< Transactional memory */
+   RTE_CPUFLAG_AVX512F,/**< AVX512F */

/* (EAX 8001h) ECX features */
RTE_CPUFLAG_LAHF_SAHF,  /**< LAHF_SAHF */
@@ -238,6 +239,7 @@ static const struct feature_entry cpu_feature_table[] = {
FEAT_DEF(ERMS, 0x0007, 0, RTE_REG_EBX,  8)
FEAT_DEF(INVPCID, 0x0007, 0, RTE_REG_EBX, 10)
FEAT_DEF(RTM, 0x0007, 0, RTE_REG_EBX, 11)
+   FEAT_DEF(AVX512F, 0x0007, 0, RTE_REG_EBX, 16)

FEAT_DEF(LAHF_SAHF, 0x8001, 0, RTE_REG_ECX,  0)
FEAT_DEF(LZCNT, 0x8001, 0, RTE_REG_ECX,  4)
-- 
2.5.0



[dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms

2016-01-17 Thread Zhihong Wang
This patch set optimizes DPDK memcpy for AVX512 platforms, to make full
utilization of hardware resources and deliver high performance.

In current DPDK, memcpy holds a large proportion of execution time in
libs like Vhost, especially for large packets, and this patch can bring
considerable benefits.

The implementation is based on the current DPDK memcpy framework, some
background introduction can be found in these threads:
http://dpdk.org/ml/archives/dev/2014-November/008158.html
http://dpdk.org/ml/archives/dev/2015-January/011800.html

Code changes are:

  1. Read CPUID to check if AVX512 is supported by CPU

  2. Predefine AVX512 macro if AVX512 is enabled by compiler

  3. Implement AVX512 memcpy and choose the right implementation based on
 predefined macros

  4. Decide alignment unit for memcpy perf test based on predefined macros

--
Changes in v2:

  1. Tune performance for prior platforms

Zhihong Wang (5):
  lib/librte_eal: Identify AVX512 CPU flag
  mk: Predefine AVX512 macro for compiler
  lib/librte_eal: Optimize memcpy for AVX512 platforms
  app/test: Adjust alignment unit for memcpy perf test
  lib/librte_eal: Tune memcpy for prior platforms

 app/test/test_memcpy_perf.c|   6 +
 .../common/include/arch/x86/rte_cpuflags.h |   2 +
 .../common/include/arch/x86/rte_memcpy.h   | 269 -
 mk/rte.cpuflags.mk |   4 +
 4 files changed, 268 insertions(+), 13 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH 4/4] app/test: Adjust alignment unit for memcpy perf test

2016-01-14 Thread Zhihong Wang
Decide alignment unit for memcpy perf test based on predefined macros.

Signed-off-by: Zhihong Wang 
---
 app/test/test_memcpy_perf.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/app/test/test_memcpy_perf.c b/app/test/test_memcpy_perf.c
index 754828e..73babec 100644
--- a/app/test/test_memcpy_perf.c
+++ b/app/test/test_memcpy_perf.c
@@ -79,7 +79,13 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE 100

 /* Data is aligned on this many bytes (power of 2) */
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+#define ALIGNMENT_UNIT  64
+#elif RTE_MACHINE_CPUFLAG_AVX2
 #define ALIGNMENT_UNIT  32
+#else /* RTE_MACHINE_CPUFLAG */
+#define ALIGNMENT_UNIT  16
+#endif /* RTE_MACHINE_CPUFLAG */

 /*
  * Pointers used in performance tests. The two large buffers are for uncached
-- 
2.5.0



[dpdk-dev] [PATCH 3/4] lib/librte_eal: Optimize memcpy for AVX512 platforms

2016-01-14 Thread Zhihong Wang
Implement AVX512 memcpy and choose the right implementation based on
predefined macros, to make full utilization of hardware resources and
deliver high performance.

In current DPDK, memcpy holds a large proportion of execution time in
libs like Vhost, especially for large packets, and this patch can bring
considerable benefits for AVX512 platforms.

The implementation is based on the current DPDK memcpy framework, some
background introduction can be found in these threads:
http://dpdk.org/ml/archives/dev/2014-November/008158.html
http://dpdk.org/ml/archives/dev/2015-January/011800.html

Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 247 -
 1 file changed, 243 insertions(+), 4 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index 6a57426..fee954a 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -37,7 +37,7 @@
 /**
  * @file
  *
- * Functions for SSE/AVX/AVX2 implementation of memcpy().
+ * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */

 #include 
@@ -67,7 +67,246 @@ extern "C" {
 static inline void *
 rte_memcpy(void *dst, const void *src, size_t n) 
__attribute__((always_inline));

-#ifdef RTE_MACHINE_CPUFLAG_AVX2
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+
+/**
+ * AVX512 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+   __m128i xmm0;
+
+   xmm0 = _mm_loadu_si128((const __m128i *)src);
+   _mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+   __m256i ymm0;
+
+   ymm0 = _mm256_loadu_si256((const __m256i *)src);
+   _mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+   __m512i zmm0;
+
+   zmm0 = _mm512_loadu_si512((const void *)src);
+   _mm512_storeu_si512((void *)dst, zmm0);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+   rte_mov64(dst + 0 * 64, src + 0 * 64);
+   rte_mov64(dst + 1 * 64, src + 1 * 64);
+}
+
+/**
+ * Copy 256 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+   rte_mov64(dst + 0 * 64, src + 0 * 64);
+   rte_mov64(dst + 1 * 64, src + 1 * 64);
+   rte_mov64(dst + 2 * 64, src + 2 * 64);
+   rte_mov64(dst + 3 * 64, src + 3 * 64);
+}
+
+/**
+ * Copy 128-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+   __m512i zmm0, zmm1;
+
+   while (n >= 128) {
+   zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+   n -= 128;
+   zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+   src = src + 128;
+   _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+   _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+   dst = dst + 128;
+   }
+}
+
+/**
+ * Copy 512-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+   __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+
+   while (n >= 512) {
+   zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+   n -= 512;
+   zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+   zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
+   zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
+   zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
+   zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
+   zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
+   zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
+   src = src + 512;
+   _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+   _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+   _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
+   _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
+   _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
+   _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
+   _mm512_storeu_si512((void *)(dst

[dpdk-dev] [PATCH 2/4] mk: Predefine AVX512 macro for compiler

2016-01-14 Thread Zhihong Wang
Predefine AVX512 macro if AVX512 is enabled by compiler.

Signed-off-by: Zhihong Wang 
---
 mk/rte.cpuflags.mk | 4 
 1 file changed, 4 insertions(+)

diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 28f203b..19a3e7e 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -89,6 +89,10 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__AVX2__),)
 CPUFLAGS += AVX2
 endif

+ifneq ($(filter $(AUTO_CPUFLAGS),__AVX512F__),)
+CPUFLAGS += AVX512F
+endif
+
 # IBM Power CPU flags
 ifneq ($(filter $(AUTO_CPUFLAGS),__PPC64__),)
 CPUFLAGS += PPC64
-- 
2.5.0



[dpdk-dev] [PATCH 1/4] lib/librte_eal: Identify AVX512 CPU flag

2016-01-14 Thread Zhihong Wang
Read CPUID to check if AVX512 is supported by CPU.

Signed-off-by: Zhihong Wang 
---
 lib/librte_eal/common/include/arch/x86/rte_cpuflags.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h 
b/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h
index dd56553..89c0d9d 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h
@@ -131,6 +131,7 @@ enum rte_cpu_flag_t {
RTE_CPUFLAG_ERMS,   /**< ERMS */
RTE_CPUFLAG_INVPCID,/**< INVPCID */
RTE_CPUFLAG_RTM,/**< Transactional memory */
+   RTE_CPUFLAG_AVX512F,/**< AVX512F */

/* (EAX 8001h) ECX features */
RTE_CPUFLAG_LAHF_SAHF,  /**< LAHF_SAHF */
@@ -238,6 +239,7 @@ static const struct feature_entry cpu_feature_table[] = {
FEAT_DEF(ERMS, 0x0007, 0, RTE_REG_EBX,  8)
FEAT_DEF(INVPCID, 0x0007, 0, RTE_REG_EBX, 10)
FEAT_DEF(RTM, 0x0007, 0, RTE_REG_EBX, 11)
+   FEAT_DEF(AVX512F, 0x0007, 0, RTE_REG_EBX, 16)

FEAT_DEF(LAHF_SAHF, 0x8001, 0, RTE_REG_ECX,  0)
FEAT_DEF(LZCNT, 0x8001, 0, RTE_REG_ECX,  4)
-- 
2.5.0



[dpdk-dev] [PATCH 0/4] Optimize memcpy for AVX512 platforms

2016-01-14 Thread Zhihong Wang
This patch set optimizes DPDK memcpy for AVX512 platforms, to make full
utilization of hardware resources and deliver high performance.

In current DPDK, memcpy holds a large proportion of execution time in
libs like Vhost, especially for large packets, and this patch can bring
considerable benefits.

The implementation is based on the current DPDK memcpy framework, some
background introduction can be found in these threads:
http://dpdk.org/ml/archives/dev/2014-November/008158.html
http://dpdk.org/ml/archives/dev/2015-January/011800.html

Code changes are:

  1. Read CPUID to check if AVX512 is supported by CPU

  2. Predefine AVX512 macro if AVX512 is enabled by compiler

  3. Implement AVX512 memcpy and choose the right implementation based on
 predefined macros

  4. Decide alignment unit for memcpy perf test based on predefined macros

Zhihong Wang (4):
  lib/librte_eal: Identify AVX512 CPU flag
  mk: Predefine AVX512 macro for compiler
  lib/librte_eal: Optimize memcpy for AVX512 platforms
  app/test: Adjust alignment unit for memcpy perf test

 app/test/test_memcpy_perf.c|   6 +
 .../common/include/arch/x86/rte_cpuflags.h |   2 +
 .../common/include/arch/x86/rte_memcpy.h   | 247 -
 mk/rte.cpuflags.mk |   4 +
 4 files changed, 255 insertions(+), 4 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v5 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-30 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l3fwd/main.c | 46 ++
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..21a5782 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,8 @@
 #include 
 #include 
 #include 
+#include <signal.h>
+#include <stdbool.h>

 #include 
 #include 
@@ -75,6 +77,8 @@
 #include 
 #include 

+static volatile bool force_quit;
+
 #define APP_LOOKUP_EXACT_MATCH  0
 #define APP_LOOKUP_LPM  1
 #define DO_RFC_1812_CHECKS
@@ -1553,7 +1557,7 @@ main_loop(__attribute__((unused)) void *dummy)
portid, queueid);
}

-   while (1) {
+   while (!force_quit) {

cur_tsc = rte_rdtsc();

@@ -1781,6 +1785,8 @@ main_loop(__attribute__((unused)) void *dummy)

}
}
+
+   return 0;
 }

 static int
@@ -2516,8 +2522,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -2559,6 +2569,16 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\n\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit = true;
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2579,6 +2599,10 @@ main(int argc, char **argv)
argc -= ret;
argv += ret;

+   force_quit = false;
+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* pre-init dst MACs for all ports to 02:00:00:00:00:xx */
for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++) {
dest_eth_addr[portid] = ETHER_LOCAL_ADMIN_ADDR + 
((uint64_t)portid << 40);
@@ -2733,12 +2757,26 @@ main(int argc, char **argv)

check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask);

+   ret = 0;
/* launch per-lcore init on every lcore */
rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER);
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
-   if (rte_eal_wait_lcore(lcore_id) < 0)
-   return -1;
+   if (rte_eal_wait_lcore(lcore_id) < 0) {
+   ret = -1;
+   break;
+   }
}

-   return 0;
+   /* stop ports */
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Closing port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+
+   return ret;
 }
-- 
2.5.0



[dpdk-dev] [PATCH v5 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-30 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l2fwd/main.c | 43 +++
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..f35d8a1 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,8 @@
 #include 
 #include 
 #include 
+#include <signal.h>
+#include <stdbool.h>

 #include 
 #include 
@@ -69,6 +71,8 @@
 #include 
 #include 

+static volatile bool force_quit;
+
 #define RTE_LOGTYPE_L2FWD RTE_LOGTYPE_USER1

 #define NB_MBUF   8192
@@ -283,7 +287,7 @@ l2fwd_main_loop(void)
portid);
}

-   while (1) {
+   while (!force_quit) {

cur_tsc = rte_rdtsc();

@@ -491,8 +495,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -534,6 +542,16 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\n\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit = true;
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -553,6 +571,10 @@ main(int argc, char **argv)
argc -= ret;
argv += ret;

+   force_quit = false;
+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* parse application arguments (after the EAL ones) */
ret = l2fwd_parse_args(argc, argv);
if (ret < 0)
@@ -696,12 +718,25 @@ main(int argc, char **argv)

check_all_ports_link_status(nb_ports, l2fwd_enabled_port_mask);

+   ret = 0;
/* launch per-lcore init on every lcore */
rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
-   if (rte_eal_wait_lcore(lcore_id) < 0)
-   return -1;
+   if (rte_eal_wait_lcore(lcore_id) < 0) {
+   ret = -1;
+   break;
+   }
}

-   return 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Closing port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+
+   return ret;
 }
-- 
2.5.0



[dpdk-dev] [PATCH v5 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-30 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 app/test-pmd/cmdline.c | 20 +---
 app/test-pmd/testpmd.c | 39 +--
 app/test-pmd/testpmd.h |  1 +
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 73298c9..6d28c1b 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -90,6 +90,8 @@

 #include "testpmd.h"

+static struct cmdline *testpmd_cl;
+
 static void cmd_reconfig_device_queue(portid_t id, uint8_t dev, uint8_t queue);

 #ifdef RTE_NIC_BYPASS
@@ -9778,17 +9780,21 @@ cmdline_parse_ctx_t main_ctx[] = {
 void
 prompt(void)
 {
-   struct cmdline *cl;
-
/* initialize non-constant commands */
cmd_set_fwd_mode_init();

-   cl = cmdline_stdin_new(main_ctx, "testpmd> ");
-   if (cl == NULL) {
+   testpmd_cl = cmdline_stdin_new(main_ctx, "testpmd> ");
+   if (testpmd_cl == NULL)
return;
-   }
-   cmdline_interact(cl);
-   cmdline_stdin_exit(cl);
+   cmdline_interact(testpmd_cl);
+   cmdline_stdin_exit(testpmd_cl);
+}
+
+void
+prompt_exit(void)
+{
+   if (testpmd_cl != NULL)
+   cmdline_quit(testpmd_cl);
 }

 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..1319917 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1570,13 +1570,16 @@ pmd_test_exit(void)
if (test_done == 0)
stop_packet_forwarding();

-   FOREACH_PORT(pt_id, ports) {
-   printf("Stopping port %d...", pt_id);
-   fflush(stdout);
-   rte_eth_dev_close(pt_id);
-   printf("done\n");
+   if (ports != NULL) {
+   no_link_check = 1;
+   FOREACH_PORT(pt_id, ports) {
+   printf("\nShutting down port %d...\n", pt_id);
+   fflush(stdout);
+   stop_port(pt_id);
+   close_port(pt_id);
+   }
}
-   printf("bye...\n");
+   printf("\nBye...\n");
 }

 typedef void (*cmd_func_t)(void);
@@ -1984,12 +1987,35 @@ init_port(void)
ports[pid].enabled = 1;
 }

+static void
+force_quit(void)
+{
+   pmd_test_exit();
+   prompt_exit();
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit();
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
@@ -2041,6 +2067,7 @@ main(int argc, char** argv)
start_packet_forwarding(0);
printf("Press enter to exit\n");
rc = read(0, &c, 1);
+   pmd_test_exit();
if (rc < 0)
return 1;
}
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index ee7de98..7ffc17b 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -462,6 +462,7 @@ unsigned int parse_item_list(char* str, const char* 
item_name,
unsigned int *parsed_items, int check_unique_values);
 void launch_args_parse(int argc, char** argv);
 void prompt(void);
+void prompt_exit(void);
 void nic_stats_display(portid_t port_id);
 void nic_stats_clear(portid_t port_id);
 void nic_xstats_display(portid_t port_id);
-- 
2.5.0



[dpdk-dev] [PATCH v5 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-30 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, make sure 
all ports are properly stopped and closed.
For virtual ports, the stop and close function may deal with resource cleanup, 
such as socket files unlinking.

--
Changes in v5:

1. Get rid of over complicated logic in l2fwd and l3fwd

--
Changes in v4:

1. Add port status control in l2fwd and l3fwd

--
Changes in v3:

1. Make sure correct port operations regarding status

2. Small fixes to make the code clearer

--
Changes in v2:

1. Make sure graceful exit for all running phases

2. Make sure program exits with the right status

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/cmdline.c | 20 +---
 app/test-pmd/testpmd.c | 39 +--
 app/test-pmd/testpmd.h |  1 +
 examples/l2fwd/main.c  | 43 +++
 examples/l3fwd/main.c  | 46 ++
 5 files changed, 128 insertions(+), 21 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v4 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-29 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l3fwd/main.c | 167 +++---
 1 file changed, 145 insertions(+), 22 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..f73d2a4 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,9 @@
 #include 
 #include 
 #include 
+#include <signal.h>
+#include <stdbool.h>
+#include <rte_atomic.h>

 #include 
 #include 
@@ -75,6 +78,16 @@
 #include 
 #include 

+#define PORT_IDLE 0
+#define PORT_INIT 1
+#define PORT_WORK 2
+#define PORT_STOP 3
+#define PORT_QUIT 4
+
+static volatile uint32_t port_status;
+static volatile bool force_quit;
+static volatile int signo_quit;
+
 #define APP_LOOKUP_EXACT_MATCH  0
 #define APP_LOOKUP_LPM  1
 #define DO_RFC_1812_CHECKS
@@ -1553,8 +1566,7 @@ main_loop(__attribute__((unused)) void *dummy)
portid, queueid);
}

-   while (1) {
-
+   while (!force_quit) {
cur_tsc = rte_rdtsc();

/*
@@ -2516,8 +2528,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -2559,6 +2575,101 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static uint8_t
+start_ports(void)
+{
+   unsigned portid, nb_ports, avail_ports;
+   int ret;
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_IDLE, PORT_INIT) == 0) {
+   printf("Ports not idle...\n");
+   return 0;
+   }
+
+   nb_ports = rte_eth_dev_count();
+   avail_ports = 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   avail_ports++;
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start: err=%d, port=%d\n",
+   ret, portid);
+   /*
+* If enabled, put device in promiscuous mode.
+* This allows IO forwarding mode to forward packets
+* to itself through 2 cross-connected  ports of the
+* target machine.
+*/
+   if (promiscuous_on)
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+
+   if (avail_ports) {
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_INIT, PORT_WORK) == 0)
+   printf("Set port state failed!\n");
+   } else {
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_INIT, PORT_IDLE) == 0)
+   printf("Set port state failed!\n");
+   }
+
+   return avail_ports;
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_WORK, PORT_STOP) == 0) {
+   printf("Ports not started...\n");
+   return;
+   }
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_STOP, PORT_IDLE) == 0)
+   printf("Set port state failed!\n");
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_IDLE, PORT_QUIT) == 0) {
+   printf("Ports started already...\n");
+   signo_quit = signum;
+   force_quit = true;
+   

[dpdk-dev] [PATCH v4 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-29 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l2fwd/main.c | 161 +++---
 1 file changed, 139 insertions(+), 22 deletions(-)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..9a6f80b 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 

 #include 
 #include 
@@ -69,6 +72,16 @@
 #include 
 #include 

+#define PORT_IDLE 0
+#define PORT_INIT 1
+#define PORT_WORK 2
+#define PORT_STOP 3
+#define PORT_QUIT 4
+
+static volatile uint32_t port_status;
+static volatile bool force_quit;
+static volatile int signo_quit;
+
 #define RTE_LOGTYPE_L2FWD RTE_LOGTYPE_USER1

 #define NB_MBUF   8192
@@ -283,8 +296,7 @@ l2fwd_main_loop(void)
portid);
}

-   while (1) {
-
+   while (!force_quit) {
cur_tsc = rte_rdtsc();

/*
@@ -491,8 +503,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -534,18 +550,110 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static uint8_t
+start_ports(void)
+{
+   unsigned portid, nb_ports, avail_ports;
+   int ret;
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_IDLE, PORT_INIT) == 0) {
+   printf("Ports not idle...\n");
+   return 0;
+   }
+
+   nb_ports = rte_eth_dev_count();
+   avail_ports = 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   avail_ports++;
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start:err=%d, port=%u\n",
+   ret, (unsigned) portid);
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+
+   if (avail_ports) {
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_INIT, PORT_WORK) == 0)
+   printf("Set port state failed!\n");
+   } else {
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_INIT, PORT_IDLE) == 0)
+   printf("Set port state failed!\n");
+   }
+
+   return avail_ports;
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_WORK, PORT_STOP) == 0) {
+   printf("Ports not started...\n");
+   return;
+   }
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_STOP, PORT_IDLE) == 0)
+   printf("Set port state failed!\n");
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_IDLE, PORT_QUIT) == 0) {
+   printf("Ports started already...\n");
+   signo_quit = signum;
+   force_quit = true;
+   } else {
+   printf("Ports not started yet...\n");
+   printf("Bye...\n");
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
struct lcore_queue_

[dpdk-dev] [PATCH v4 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-29 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 app/test-pmd/cmdline.c | 20 +---
 app/test-pmd/testpmd.c | 39 +--
 app/test-pmd/testpmd.h |  1 +
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 73298c9..6d28c1b 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -90,6 +90,8 @@

 #include "testpmd.h"

+static struct cmdline *testpmd_cl;
+
 static void cmd_reconfig_device_queue(portid_t id, uint8_t dev, uint8_t queue);

 #ifdef RTE_NIC_BYPASS
@@ -9778,17 +9780,21 @@ cmdline_parse_ctx_t main_ctx[] = {
 void
 prompt(void)
 {
-   struct cmdline *cl;
-
/* initialize non-constant commands */
cmd_set_fwd_mode_init();

-   cl = cmdline_stdin_new(main_ctx, "testpmd> ");
-   if (cl == NULL) {
+   testpmd_cl = cmdline_stdin_new(main_ctx, "testpmd> ");
+   if (testpmd_cl == NULL)
return;
-   }
-   cmdline_interact(cl);
-   cmdline_stdin_exit(cl);
+   cmdline_interact(testpmd_cl);
+   cmdline_stdin_exit(testpmd_cl);
+}
+
+void
+prompt_exit(void)
+{
+   if (testpmd_cl != NULL)
+   cmdline_quit(testpmd_cl);
 }

 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..1319917 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1570,13 +1570,16 @@ pmd_test_exit(void)
if (test_done == 0)
stop_packet_forwarding();

-   FOREACH_PORT(pt_id, ports) {
-   printf("Stopping port %d...", pt_id);
-   fflush(stdout);
-   rte_eth_dev_close(pt_id);
-   printf("done\n");
+   if (ports != NULL) {
+   no_link_check = 1;
+   FOREACH_PORT(pt_id, ports) {
+   printf("\nShutting down port %d...\n", pt_id);
+   fflush(stdout);
+   stop_port(pt_id);
+   close_port(pt_id);
+   }
}
-   printf("bye...\n");
+   printf("\nBye...\n");
 }

 typedef void (*cmd_func_t)(void);
@@ -1984,12 +1987,35 @@ init_port(void)
ports[pid].enabled = 1;
 }

+static void
+force_quit(void)
+{
+   pmd_test_exit();
+   prompt_exit();
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit();
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
@@ -2041,6 +2067,7 @@ main(int argc, char** argv)
start_packet_forwarding(0);
printf("Press enter to exit\n");
rc = read(0, &c, 1);
+   pmd_test_exit();
if (rc < 0)
return 1;
}
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index ee7de98..7ffc17b 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -462,6 +462,7 @@ unsigned int parse_item_list(char* str, const char* 
item_name,
unsigned int *parsed_items, int check_unique_values);
 void launch_args_parse(int argc, char** argv);
 void prompt(void);
+void prompt_exit(void);
 void nic_stats_display(portid_t port_id);
 void nic_stats_clear(portid_t port_id);
 void nic_xstats_display(portid_t port_id);
-- 
2.5.0



[dpdk-dev] [PATCH v4 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-29 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, make sure 
all ports are properly stopped and closed.
For virtual ports, the stop and close function may deal with resource cleanup, 
such as socket files unlinking.

--
Changes in v4:

1. Add port status control in l2fwd and l3fwd

--
Changes in v3:

1. Make sure correct port operations regarding status

2. Small fixes to make the code clearer

--
Changes in v2:

1. Make sure graceful exit for all running phases

2. Make sure program exits with the right status

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/cmdline.c |  20 +++---
 app/test-pmd/testpmd.c |  39 ++--
 app/test-pmd/testpmd.h |   1 +
 examples/l2fwd/main.c  | 161 ---
 examples/l3fwd/main.c  | 167 ++---
 5 files changed, 331 insertions(+), 57 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v3 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-28 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l3fwd/main.c | 129 +-
 1 file changed, 107 insertions(+), 22 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..c766cf5 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 

 #include 
 #include 
@@ -75,6 +78,10 @@
 #include 
 #include 

+static volatile bool port_started;
+static volatile bool force_quit;
+static volatile int signo_quit;
+
 #define APP_LOOKUP_EXACT_MATCH  0
 #define APP_LOOKUP_LPM  1
 #define DO_RFC_1812_CHECKS
@@ -1553,8 +1560,7 @@ main_loop(__attribute__((unused)) void *dummy)
portid, queueid);
}

-   while (1) {
-
+   while (!force_quit) {
cur_tsc = rte_rdtsc();

/*
@@ -2516,8 +2522,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -2559,6 +2569,76 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static uint8_t
+start_ports(void)
+{
+   unsigned portid, nb_ports, avail_ports;
+   int ret;
+
+   nb_ports = rte_eth_dev_count();
+   avail_ports = 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   avail_ports++;
+   port_started = true;
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start: err=%d, port=%d\n",
+   ret, portid);
+   /*
+* If enabled, put device in promiscuous mode.
+* This allows IO forwarding mode to forward packets
+* to itself through 2 cross-connected  ports of the
+* target machine.
+*/
+   if (promiscuous_on)
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+
+   return avail_ports;
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   port_started = false;
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (port_started) {
+   printf("Ports started already...\n");
+   signo_quit = signum;
+   force_quit = true;
+   } else {
+   printf("Ports not started yet...\n");
+   printf("Bye...\n");
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2571,6 +2651,12 @@ main(int argc, char **argv)
unsigned lcore_id;
uint32_t n_tx_queue, nb_lcores;
uint8_t portid, nb_rx_queue, queue, socketid;
+   uint8_t avail_ports;
+
+   port_started = false;
+   force_quit = false;
+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);

/* init EAL */
ret = rte_eal_init(argc, argv);
@@ -2711,34 +2797,33 @@ main(int argc, char **argv)
printf("\n");

/* start ports */
-   for (portid = 0; portid < nb_ports; portid++) {
-   if ((enabled_port_mask & (1 << portid)) == 0) {
-   continue;
-   }
-   /* Start device */
-   ret = rte_eth_dev

[dpdk-dev] [PATCH v3 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-28 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l2fwd/main.c | 123 +-
 1 file changed, 101 insertions(+), 22 deletions(-)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..ecd5d2b 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 

 #include 
 #include 
@@ -69,6 +72,10 @@
 #include 
 #include 

+static volatile bool port_started;
+static volatile bool force_quit;
+static volatile int signo_quit;
+
 #define RTE_LOGTYPE_L2FWD RTE_LOGTYPE_USER1

 #define NB_MBUF   8192
@@ -283,8 +290,7 @@ l2fwd_main_loop(void)
portid);
}

-   while (1) {
-
+   while (!force_quit) {
cur_tsc = rte_rdtsc();

/*
@@ -491,8 +497,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -534,18 +544,85 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static uint8_t
+start_ports(void)
+{
+   unsigned portid, nb_ports, avail_ports;
+   int ret;
+
+   nb_ports = rte_eth_dev_count();
+   avail_ports = 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   avail_ports++;
+   port_started = true;
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start:err=%d, port=%u\n",
+ ret, (unsigned) portid);
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+
+   return avail_ports;
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   port_started = false;
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (port_started) {
+   printf("Ports started already...\n");
+   signo_quit = signum;
+   force_quit = true;
+   } else {
+   printf("Ports not started yet...\n");
+   printf("Bye...\n");
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
struct lcore_queue_conf *qconf;
struct rte_eth_dev_info dev_info;
int ret;
-   uint8_t nb_ports;
-   uint8_t nb_ports_available;
+   uint8_t nb_ports, avail_ports;
uint8_t portid, last_port;
unsigned lcore_id, rx_lcore_id;
unsigned nb_ports_in_mask = 0;

+   port_started = false;
+   force_quit = false;
+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
@@ -627,14 +704,11 @@ main(int argc, char **argv)
printf("Lcore %u: RX port %u\n", rx_lcore_id, (unsigned) 
portid);
}

-   nb_ports_available = nb_ports;
-
/* Initialise each port */
for (portid = 0; portid < nb_ports; portid++) {
/* skip ports that are not enabled */
if ((l2fwd_enabled_port_mask & (1 << portid)) == 0) {
printf("Skipping disabled port %u\n", (unsigned) 
portid);
-   nb_ports_available--;
continue;
   

[dpdk-dev] [PATCH v3 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-28 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 app/test-pmd/cmdline.c | 20 +---
 app/test-pmd/testpmd.c | 39 +--
 app/test-pmd/testpmd.h |  1 +
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 73298c9..6d28c1b 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -90,6 +90,8 @@

 #include "testpmd.h"

+static struct cmdline *testpmd_cl;
+
 static void cmd_reconfig_device_queue(portid_t id, uint8_t dev, uint8_t queue);

 #ifdef RTE_NIC_BYPASS
@@ -9778,17 +9780,21 @@ cmdline_parse_ctx_t main_ctx[] = {
 void
 prompt(void)
 {
-   struct cmdline *cl;
-
/* initialize non-constant commands */
cmd_set_fwd_mode_init();

-   cl = cmdline_stdin_new(main_ctx, "testpmd> ");
-   if (cl == NULL) {
+   testpmd_cl = cmdline_stdin_new(main_ctx, "testpmd> ");
+   if (testpmd_cl == NULL)
return;
-   }
-   cmdline_interact(cl);
-   cmdline_stdin_exit(cl);
+   cmdline_interact(testpmd_cl);
+   cmdline_stdin_exit(testpmd_cl);
+}
+
+void
+prompt_exit(void)
+{
+   if (testpmd_cl != NULL)
+   cmdline_quit(testpmd_cl);
 }

 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..1319917 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1570,13 +1570,16 @@ pmd_test_exit(void)
if (test_done == 0)
stop_packet_forwarding();

-   FOREACH_PORT(pt_id, ports) {
-   printf("Stopping port %d...", pt_id);
-   fflush(stdout);
-   rte_eth_dev_close(pt_id);
-   printf("done\n");
+   if (ports != NULL) {
+   no_link_check = 1;
+   FOREACH_PORT(pt_id, ports) {
+   printf("\nShutting down port %d...\n", pt_id);
+   fflush(stdout);
+   stop_port(pt_id);
+   close_port(pt_id);
+   }
}
-   printf("bye...\n");
+   printf("\nBye...\n");
 }

 typedef void (*cmd_func_t)(void);
@@ -1984,12 +1987,35 @@ init_port(void)
ports[pid].enabled = 1;
 }

+static void
+force_quit(void)
+{
+   pmd_test_exit();
+   prompt_exit();
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit();
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
@@ -2041,6 +2067,7 @@ main(int argc, char** argv)
start_packet_forwarding(0);
printf("Press enter to exit\n");
rc = read(0, &c, 1);
+   pmd_test_exit();
if (rc < 0)
return 1;
}
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index ee7de98..7ffc17b 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -462,6 +462,7 @@ unsigned int parse_item_list(char* str, const char* 
item_name,
unsigned int *parsed_items, int check_unique_values);
 void launch_args_parse(int argc, char** argv);
 void prompt(void);
+void prompt_exit(void);
 void nic_stats_display(portid_t port_id);
 void nic_stats_clear(portid_t port_id);
 void nic_xstats_display(portid_t port_id);
-- 
2.5.0



[dpdk-dev] [PATCH v3 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-28 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, make sure 
all ports are properly stopped and closed.
For virtual ports, the stop and close function may deal with resource cleanup, 
such as socket files unlinking.

--
Changes in v3:

1. Make sure correct port operations regarding status

2. Small fixes to make the code clearer

--
Changes in v2:

1. Make sure graceful exit for all running phases

2. Make sure program exits with the right status

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/cmdline.c |  20 +---
 app/test-pmd/testpmd.c |  39 ---
 app/test-pmd/testpmd.h |   1 +
 examples/l2fwd/main.c  | 123 +-
 examples/l3fwd/main.c  | 129 -
 5 files changed, 255 insertions(+), 57 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v2 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-24 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
---
 examples/l3fwd/main.c | 110 +-
 1 file changed, 90 insertions(+), 20 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..b9f3232 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 

 #include 
 #include 
@@ -75,6 +77,9 @@
 #include 
 #include 

+static int force_quit = -1;
+static int signo_quit = -1;
+
 #define APP_LOOKUP_EXACT_MATCH  0
 #define APP_LOOKUP_LPM  1
 #define DO_RFC_1812_CHECKS
@@ -1554,6 +1559,8 @@ main_loop(__attribute__((unused)) void *dummy)
}

while (1) {
+   if (unlikely(force_quit != 0))
+   break;

cur_tsc = rte_rdtsc();

@@ -2559,6 +2566,74 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static void
+start_ports(void)
+{
+   unsigned portid, nb_ports;
+   int ret;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start: err=%d, port=%d\n",
+   ret, portid);
+   /*
+* If enabled, put device in promiscuous mode.
+* This allows IO forwarding mode to forward packets
+* to itself through 2 cross-connected  ports of the
+* target machine.
+*/
+   if (promiscuous_on)
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+}
+
+static void
+signal_handler(__rte_unused int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (force_quit < 0) {
+   printf("Forwarding not started yet...\n");
+   /* stop ports */
+   stop_ports();
+   printf("Bye...\n");
+   /* inform if there's a caller */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   } else {
+   printf("Forwarding started already...\n");
+   signo_quit = signum;
+   force_quit = 1;
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2572,6 +2647,9 @@ main(int argc, char **argv)
uint32_t n_tx_queue, nb_lcores;
uint8_t portid, nb_rx_queue, queue, socketid;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
@@ -2711,34 +2789,26 @@ main(int argc, char **argv)
printf("\n");

/* start ports */
-   for (portid = 0; portid < nb_ports; portid++) {
-   if ((enabled_port_mask & (1 << portid)) == 0) {
-   continue;
-   }
-   /* Start device */
-   ret = rte_eth_dev_start(portid);
-   if (ret < 0)
-   rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, 
port=%d\n",
-   ret, portid);
-
-   /*
-* If enabled, put device in promiscuous mode.
-* This allows IO forwarding mode to forward packets
-* to itself through 2 cross-connected  ports of the
-* target machine.
-*/
-   if (promiscuous_on)
-   rte_eth_promiscuous_enable(portid);
-   }
-
+   start_ports();
check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask);

/* launch per-lcore init on every lcore */
+   force_quit = 0;
rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER);
RTE_LCORE_FOREACH_SL

[dpdk-dev] [PATCH v2 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-24 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
---
 examples/l2fwd/main.c | 60 +++
 1 file changed, 60 insertions(+)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..75899dd 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 

 #include 
 #include 
@@ -69,6 +71,9 @@
 #include 
 #include 

+static int force_quit = -1;
+static int signo_quit = -1;
+
 #define RTE_LOGTYPE_L2FWD RTE_LOGTYPE_USER1

 #define NB_MBUF   8192
@@ -284,6 +289,8 @@ l2fwd_main_loop(void)
}

while (1) {
+   if (unlikely(force_quit != 0))
+   break;

cur_tsc = rte_rdtsc();

@@ -534,6 +541,45 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+}
+
+static void
+signal_handler(__rte_unused int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (force_quit < 0) {
+   printf("Forwarding not started yet...\n");
+   /* stop ports */
+   stop_ports();
+   printf("Bye...\n");
+   /* inform if there's a caller */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   } else {
+   printf("Forwarding started already...\n");
+   signo_quit = signum;
+   force_quit = 1;
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -546,6 +592,9 @@ main(int argc, char **argv)
unsigned lcore_id, rx_lcore_id;
unsigned nb_ports_in_mask = 0;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
@@ -697,11 +746,22 @@ main(int argc, char **argv)
check_all_ports_link_status(nb_ports, l2fwd_enabled_port_mask);

/* launch per-lcore init on every lcore */
+   force_quit = 0;
rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
if (rte_eal_wait_lcore(lcore_id) < 0)
return -1;
}

+   printf("Stopping forwarding... Done\n");
+   /* stop ports */
+   stop_ports();
+   printf("Bye...\n");
+   /* inform if there's a caller */
+   if (force_quit != 0) {
+   signal(signo_quit, SIG_DFL);
+   kill(getpid(), signo_quit);
+   }
+
return 0;
 }
-- 
2.5.0



[dpdk-dev] [PATCH v2 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-24 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/cmdline.c | 19 +--
 app/test-pmd/testpmd.c | 38 --
 app/test-pmd/testpmd.h |  1 +
 3 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 73298c9..4ff1739 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -90,6 +90,8 @@

 #include "testpmd.h"

+static struct cmdline *testpmd_cl;
+
 static void cmd_reconfig_device_queue(portid_t id, uint8_t dev, uint8_t queue);

 #ifdef RTE_NIC_BYPASS
@@ -9778,17 +9780,22 @@ cmdline_parse_ctx_t main_ctx[] = {
 void
 prompt(void)
 {
-   struct cmdline *cl;
-
/* initialize non-constant commands */
cmd_set_fwd_mode_init();

-   cl = cmdline_stdin_new(main_ctx, "testpmd> ");
-   if (cl == NULL) {
+   testpmd_cl = cmdline_stdin_new(main_ctx, "testpmd> ");
+   if (testpmd_cl == NULL) {
return;
}
-   cmdline_interact(cl);
-   cmdline_stdin_exit(cl);
+   cmdline_interact(testpmd_cl);
+   cmdline_stdin_exit(testpmd_cl);
+}
+
+void
+prompt_exit(void)
+{
+   if (testpmd_cl != NULL)
+   cmdline_quit(testpmd_cl);
 }

 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..cb38d56 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1570,13 +1570,16 @@ pmd_test_exit(void)
if (test_done == 0)
stop_packet_forwarding();

-   FOREACH_PORT(pt_id, ports) {
-   printf("Stopping port %d...", pt_id);
-   fflush(stdout);
-   rte_eth_dev_close(pt_id);
-   printf("done\n");
+   if (ports != NULL) {
+   FOREACH_PORT(pt_id, ports) {
+   printf("Stopping port %d...", pt_id);
+   fflush(stdout);
+   rte_eth_dev_stop(pt_id);
+   rte_eth_dev_close(pt_id);
+   printf(" Done\n");
+   }
}
-   printf("bye...\n");
+   printf("Bye...\n");
 }

 typedef void (*cmd_func_t)(void);
@@ -1984,12 +1987,34 @@ init_port(void)
ports[pid].enabled = 1;
 }

+static void
+force_quit(void)
+{
+   pmd_test_exit();
+   prompt_exit();
+}
+
+static void
+sigint_handler(__rte_unused int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit();
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, sigint_handler);
+   signal(SIGTERM, sigint_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
@@ -2041,6 +2066,7 @@ main(int argc, char** argv)
start_packet_forwarding(0);
printf("Press enter to exit\n");
rc = read(0, &c, 1);
+   pmd_test_exit();
if (rc < 0)
return 1;
}
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index ee7de98..7ffc17b 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -462,6 +462,7 @@ unsigned int parse_item_list(char* str, const char* 
item_name,
unsigned int *parsed_items, int check_unique_values);
 void launch_args_parse(int argc, char** argv);
 void prompt(void);
+void prompt_exit(void);
 void nic_stats_display(portid_t port_id);
 void nic_stats_clear(portid_t port_id);
 void nic_xstats_display(portid_t port_id);
-- 
2.5.0



[dpdk-dev] [PATCH v2 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-24 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, make sure 
all ports are properly stopped and closed.
For virtual ports, the stop and close function may deal with resource cleanup, 
such as socket files unlinking.

--
Changes in v2:

1. Make sure graceful exit for all running phases

2. Make sure program exits with the right status

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/cmdline.c |  19 ++---
 app/test-pmd/testpmd.c |  38 ++---
 app/test-pmd/testpmd.h |   1 +
 examples/l2fwd/main.c  |  60 +++
 examples/l3fwd/main.c  | 110 -
 5 files changed, 196 insertions(+), 32 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-23 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
---
 examples/l3fwd/main.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..aae16d2 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 
 #include 
@@ -2559,6 +2560,27 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+/* When we receive a INT signal, close all ports */
+static void
+sigint_handler(__rte_unused int signum)
+{
+   unsigned portid, nb_ports;
+
+   printf("Preparing to exit...\n");
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+   exit(0);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2572,6 +2594,9 @@ main(int argc, char **argv)
uint32_t n_tx_queue, nb_lcores;
uint8_t portid, nb_rx_queue, queue, socketid;

+   signal(SIGINT, sigint_handler);
+   signal(SIGTERM, sigint_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
-- 
2.5.0



[dpdk-dev] [PATCH 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-23 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
---
 examples/l2fwd/main.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..0594037 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 
 #include 
@@ -534,6 +535,27 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+/* When we receive a INT signal, close all ports */
+static void
+sigint_handler(__rte_unused int signum)
+{
+   unsigned portid, nb_ports;
+
+   printf("Preparing to exit...\n");
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+   exit(0);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -546,6 +568,9 @@ main(int argc, char **argv)
unsigned lcore_id, rx_lcore_id;
unsigned nb_ports_in_mask = 0;

+   signal(SIGINT, sigint_handler);
+   signal(SIGTERM, sigint_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
-- 
2.5.0



[dpdk-dev] [PATCH 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-23 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/testpmd.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..c259ba3 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1573,6 +1573,7 @@ pmd_test_exit(void)
FOREACH_PORT(pt_id, ports) {
printf("Stopping port %d...", pt_id);
fflush(stdout);
+   rte_eth_dev_stop(pt_id);
rte_eth_dev_close(pt_id);
printf("done\n");
}
@@ -1984,12 +1985,34 @@ init_port(void)
ports[pid].enabled = 1;
 }

+/* When we receive a INT signal, close all ports */
+static void
+sigint_handler(__rte_unused int signum)
+{
+   unsigned portid;
+
+   printf("Preparing to exit...\n");
+   FOREACH_PORT(portid, ports) {
+   if (port_id_is_invalid(portid, ENABLED_WARN))
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+   exit(0);
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, sigint_handler);
+   signal(SIGTERM, sigint_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
-- 
2.5.0



[dpdk-dev] [PATCH 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-23 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, making sure
all ports are properly stopped and closed.
For virtual ports, the stop and close function may deal with resource cleanup, 
such as socket files unlinking.

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/testpmd.c | 23 +++
 examples/l2fwd/main.c  | 25 +
 examples/l3fwd/main.c  | 25 +
 3 files changed, 73 insertions(+)

-- 
2.5.0



[dpdk-dev] [PATCH] Unlink existing unused sockets at start up

2015-12-16 Thread Zhihong Wang
This patch unlinks existing unused sockets (which cause new bindings to fail, 
e.g. vHost PMD) to ensure smooth startup.
In a lot of cases DPDK applications are terminated abnormally without proper 
resource release. Therefore, DPDK libs should be able to deal with an unclean 
boot environment.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_user/vhost-net-user.c | 28 
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c 
b/lib/librte_vhost/vhost_user/vhost-net-user.c
index 8b7a448..eac0721 100644
--- a/lib/librte_vhost/vhost_user/vhost-net-user.c
+++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
@@ -120,18 +120,38 @@ uds_socket(const char *path)
sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
if (sockfd < 0)
return -1;
-   RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
+   RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd: %d\n", sockfd);

memset(, 0, sizeof(un));
un.sun_family = AF_UNIX;
snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
ret = bind(sockfd, (struct sockaddr *), sizeof(un));
if (ret == -1) {
-   RTE_LOG(ERR, VHOST_CONFIG, "fail to bind fd:%d, remove file:%s 
and try again.\n",
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "bind fd: %d to file: %s failed, checking socket...\n",
sockfd, path);
-   goto err;
+   ret = connect(sockfd, (struct sockaddr *), sizeof(un));
+   if (ret == -1) {
+   RTE_LOG(INFO, VHOST_CONFIG,
+   "socket: %s is inactive, rebinding after 
unlink...\n", path);
+   unlink(path);
+   ret = bind(sockfd, (struct sockaddr *), sizeof(un));
+   if (ret == -1) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "bind fd: %d to file: %s failed even 
after unlink\n",
+   sockfd, path);
+   goto err;
+   }
+   } else {
+   RTE_LOG(INFO, VHOST_CONFIG,
+   "socket: %s is alive, remove it and try 
again\n", path);
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "bind fd: %d to file: %s failed\n", sockfd, 
path);
+   goto err;
+   }
}
-   RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+   RTE_LOG(INFO, VHOST_CONFIG,
+   "bind fd: %d to file: %s successful\n", sockfd, path);

ret = listen(sockfd, MAX_VIRTIO_BACKLOG);
if (ret == -1)
-- 
2.5.0



[dpdk-dev] [PATCH 2/2] lib/librte_eal: Remove unnecessary hugepage zero-filling

2015-11-22 Thread Zhihong Wang
The kernel fills newly allocated (huge) pages with zeros.
DPDK just has to populate page tables to trigger the allocation.

Signed-off-by: Zhihong Wang 
---
 lib/librte_eal/linuxapp/eal/eal_memory.c | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 0de75cd..21a5146 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -399,8 +399,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
return -1;
}

+   /* map the segment, and populate page tables,
+* the kernel fills this segment with zeros */
virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
-   MAP_SHARED, fd, 0);
+   MAP_SHARED | MAP_POPULATE, fd, 0);
if (virtaddr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
strerror(errno));
@@ -410,7 +412,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,

if (orig) {
hugepg_tbl[i].orig_va = virtaddr;
-   memset(virtaddr, 0, hugepage_sz);
}
else {
hugepg_tbl[i].final_va = virtaddr;
@@ -529,22 +530,16 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, 
struct hugepage_info *hpi)

old_addr = vma_addr;

-   /* map new, bigger segment */
+   /* map new, bigger segment, and populate page tables,
+* the kernel fills this segment with zeros */
vma_addr = mmap(vma_addr, total_size,
-   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 
0);
+   PROT_READ | PROT_WRITE, MAP_SHARED | 
MAP_POPULATE, fd, 0);

if (vma_addr == MAP_FAILED || vma_addr != old_addr) {
RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", 
__func__, strerror(errno));
close(fd);
return -1;
}
-
-   /* touch the page. this is needed because kernel 
postpones mapping
-* creation until the first page fault. with this, we 
pin down
-* the page and it is marked as used and gets into 
process' pagemap.
-*/
-   for (offset = 0; offset < total_size; offset += 
hugepage_sz)
-   *((volatile uint8_t*) RTE_PTR_ADD(vma_addr, 
offset));
}

/* set shared flock on the file. */
@@ -592,9 +587,6 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, 
struct hugepage_info *hpi)
}
}

-   /* zero out the whole segment */
-   memset(hugepg_tbl[page_idx].final_va, 0, total_size);
-
page_idx++;
}

-- 
2.5.0



[dpdk-dev] [PATCH 1/2] lib/librte_eal: Reduce timer initialization time

2015-11-22 Thread Zhihong Wang
Changing from 1/2 second to 1/10 doesn't compromise the precision, and 4/10 of 
a second is worth saving.

Signed-off-by: Zhihong Wang 
---
 lib/librte_eal/linuxapp/eal/eal_timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c 
b/lib/librte_eal/linuxapp/eal/eal_timer.c
index e0642de..b40afa0 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -271,7 +271,7 @@ get_tsc_freq(void)
 #ifdef CLOCK_MONOTONIC_RAW
 #define NS_PER_SEC 1E9

-   struct timespec sleeptime = {.tv_nsec = 5E8 }; /* 1/2 second */
+   struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 
second */

struct timespec t_start, t_end;
uint64_t tsc_hz;
-- 
2.5.0



[dpdk-dev] [PATCH 0/2] Reduce DPDK initialization time

2015-11-22 Thread Zhihong Wang
This patch aims to reduce DPDK initialization time, which is important in cases 
such as microservices.

Changes are:

1. Reduce timer initialization time

2. Remove unnecessary hugepage zero-filling operations

With this patch:

1. Timer initialization time can be reduced by 4/10 second

2. Memory initialization time can be reduced nearly by half

The 2nd topic has been brought up before in this thread:
http://dpdk.org/dev/patchwork/patch/4219/

--
Changes in v1:

1. Use macro in sleep time initialization

2. Update commit message according to code change

--
Changes in RFC v2:

1. Use MAP_POPULATE flag to populate page tables

2. Add comments to avoid future misunderstanding

Zhihong Wang (2):
  lib/librte_eal: Reduce timer initialization time
  lib/librte_eal: Remove unnecessary hugepage zero-filling

 lib/librte_eal/linuxapp/eal/eal_memory.c | 20 ++--
 lib/librte_eal/linuxapp/eal/eal_timer.c  |  2 +-
 2 files changed, 7 insertions(+), 15 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH RFC v2 2/2] lib/librte_eal: Remove unnecessary hugepage zero-filling

2015-11-19 Thread Zhihong Wang
The kernel fills newly allocated (huge) pages with zeros.
DPDK just has to touch the pages to trigger the allocation.

Signed-off-by: Zhihong Wang 
---
 lib/librte_eal/linuxapp/eal/eal_memory.c | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 0de75cd..21a5146 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -399,8 +399,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
return -1;
}

+   /* map the segment, and populate page tables,
+* the kernel fills this segment with zeros */
virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
-   MAP_SHARED, fd, 0);
+   MAP_SHARED | MAP_POPULATE, fd, 0);
if (virtaddr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
strerror(errno));
@@ -410,7 +412,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,

if (orig) {
hugepg_tbl[i].orig_va = virtaddr;
-   memset(virtaddr, 0, hugepage_sz);
}
else {
hugepg_tbl[i].final_va = virtaddr;
@@ -529,22 +530,16 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, 
struct hugepage_info *hpi)

old_addr = vma_addr;

-   /* map new, bigger segment */
+   /* map new, bigger segment, and populate page tables,
+* the kernel fills this segment with zeros */
vma_addr = mmap(vma_addr, total_size,
-   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 
0);
+   PROT_READ | PROT_WRITE, MAP_SHARED | 
MAP_POPULATE, fd, 0);

if (vma_addr == MAP_FAILED || vma_addr != old_addr) {
RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", 
__func__, strerror(errno));
close(fd);
return -1;
}
-
-   /* touch the page. this is needed because kernel 
postpones mapping
-* creation until the first page fault. with this, we 
pin down
-* the page and it is marked as used and gets into 
process' pagemap.
-*/
-   for (offset = 0; offset < total_size; offset += 
hugepage_sz)
-   *((volatile uint8_t*) RTE_PTR_ADD(vma_addr, 
offset));
}

/* set shared flock on the file. */
@@ -592,9 +587,6 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, 
struct hugepage_info *hpi)
}
}

-   /* zero out the whole segment */
-   memset(hugepg_tbl[page_idx].final_va, 0, total_size);
-
page_idx++;
}

-- 
2.5.0



[dpdk-dev] [PATCH RFC v2 1/2] lib/librte_eal: Reduce timer initialization time

2015-11-19 Thread Zhihong Wang
Changing from 1/2 second to 1/10 doesn't compromise the precision, and a 4/10 
second is worth saving.

Signed-off-by: Zhihong Wang 
---
 lib/librte_eal/linuxapp/eal/eal_timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c 
b/lib/librte_eal/linuxapp/eal/eal_timer.c
index e0642de..4de0353 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -271,7 +271,7 @@ get_tsc_freq(void)
 #ifdef CLOCK_MONOTONIC_RAW
 #define NS_PER_SEC 1E9

-   struct timespec sleeptime = {.tv_nsec = 5E8 }; /* 1/2 second */
+   struct timespec sleeptime = {.tv_nsec = 1E8 }; /* 1/10 second */

struct timespec t_start, t_end;
uint64_t tsc_hz;
-- 
2.5.0



[dpdk-dev] [PATCH RFC v2 0/2] Reduce DPDK initialization time

2015-11-19 Thread Zhihong Wang
This RFC patch aims to reduce DPDK initialization time, which is important in 
cases such as microservices.

Changes are:

1. Reduce timer initialization time

2. Remove unnecessary hugepage zero-filling operations

With this patch:

1. Timer initialization time can be reduced by 4/10 second

2. Memory initialization time can be reduced nearly by half

The 2nd topic has been brought up before in this thread:
http://dpdk.org/dev/patchwork/patch/4219/

--
Changes in v2:

1. Use MAP_POPULATE flag to populate page tables

2. Add comments to avoid future misunderstanding

Zhihong Wang (2):
  lib/librte_eal: Reduce timer initialization time
  lib/librte_eal: Remove unnecessary hugepage zero-filling

 lib/librte_eal/linuxapp/eal/eal_memory.c | 20 ++--
 lib/librte_eal/linuxapp/eal/eal_timer.c  |  2 +-
 2 files changed, 7 insertions(+), 15 deletions(-)

-- 
2.5.0



[dpdk-dev] [RFC PATCH 2/2] lib/librte_eal: Remove unnecessary hugepage zero-filling

2015-11-17 Thread Zhihong Wang
The kernel fills newly allocated (huge) pages with zeros.
DPDK just has to touch the pages to trigger the allocation.

Signed-off-by: Zhihong Wang 
---
 lib/librte_eal/linuxapp/eal/eal_memory.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 0de75cd..af823dc 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -410,7 +410,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,

if (orig) {
hugepg_tbl[i].orig_va = virtaddr;
-   memset(virtaddr, 0, hugepage_sz);
+   memset(virtaddr, 0, 8);
}
else {
hugepg_tbl[i].final_va = virtaddr;
@@ -592,9 +592,6 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, 
struct hugepage_info *hpi)
}
}

-   /* zero out the whole segment */
-   memset(hugepg_tbl[page_idx].final_va, 0, total_size);
-
page_idx++;
}

-- 
2.5.0



[dpdk-dev] [RFC PATCH 1/2] lib/librte_eal: Reduce timer initialization time

2015-11-17 Thread Zhihong Wang
Changing from 1/2 second to 1/10 doesn't compromise the precision, and 4/10 of 
a second is worth saving.

Signed-off-by: Zhihong Wang 
---
 lib/librte_eal/linuxapp/eal/eal_timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c 
b/lib/librte_eal/linuxapp/eal/eal_timer.c
index e0642de..4de0353 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -271,7 +271,7 @@ get_tsc_freq(void)
 #ifdef CLOCK_MONOTONIC_RAW
 #define NS_PER_SEC 1E9

-   struct timespec sleeptime = {.tv_nsec = 5E8 }; /* 1/2 second */
+   struct timespec sleeptime = {.tv_nsec = 1E8 }; /* 1/10 second */

struct timespec t_start, t_end;
uint64_t tsc_hz;
-- 
2.5.0



[dpdk-dev] [RFC PATCH 0/2] Reduce DPDK initialization time

2015-11-17 Thread Zhihong Wang
This RFC patch aims to reduce DPDK initialization time, which is important in 
cases such as microservices.

Changes are:

1. Reduce timer initialization time

2. Remove unnecessary hugepage zero-filling operations

With this patch:

1. Timer initialization time can be reduced by 4/10 second

2. Memory initialization time can be reduced nearly by half

The 2nd topic has been brought up before in this thread:
http://dpdk.org/dev/patchwork/patch/4219/

Zhihong Wang (2):
  lib/librte_eal: Reduce timer initialization time
  lib/librte_eal: Remove unnecessary hugepage zero-filling

 lib/librte_eal/linuxapp/eal/eal_memory.c | 5 +
 lib/librte_eal/linuxapp/eal/eal_timer.c  | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms

2015-01-29 Thread Zhihong Wang
Main code changes:

1. Differentiate architectural features based on CPU flags

a. Implement separated move functions for SSE/AVX/AVX2 to make full 
utilization of cache bandwidth

b. Implement separated copy flow specifically optimized for target 
architecture

2. Rewrite the memcpy function "rte_memcpy"

a. Add store aligning

b. Add load aligning based on architectural features

c. Put block copy loop into inline move functions for better control of 
instruction order

d. Eliminate unnecessary MOVs

3. Rewrite the inline move functions

a. Add move functions for unaligned load cases

b. Change instruction order in copy loops for better pipeline utilization

c. Use intrinsics instead of assembly code

4. Remove slow glibc call for constant copies

Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 680 +++--
 1 file changed, 509 insertions(+), 171 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index fb9eba8..7b2d382 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -34,166 +34,189 @@
 #ifndef _RTE_MEMCPY_X86_64_H_
 #define _RTE_MEMCPY_X86_64_H_

+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2 implementation of memcpy().
+ */
+
+#include 
 #include 
 #include 
-#include 
+#include 

 #ifdef __cplusplus
 extern "C" {
 #endif

-#include "generic/rte_memcpy.h"
+/**
+ * Copy bytes from one location to another. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so it's address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ * @param n
+ *   Number of bytes to copy.
+ * @return
+ *   Pointer to the destination data.
+ */
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n) 
__attribute__((always_inline));

-#ifdef __INTEL_COMPILER
-#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
-#endif
+#ifdef RTE_MACHINE_CPUFLAG_AVX2

+/**
+ * AVX2 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   : [reg_a] "=x" (reg_a)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
+   __m128i xmm0;
+
+   xmm0 = _mm_loadu_si128((const __m128i *)src);
+   _mm_storeu_si128((__m128i *)dst, xmm0);
 }

+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a, reg_b;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
-}
+   __m256i ymm0;

-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-   __m128i reg_a, reg_b, reg_c;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu 32(%[src]), %[reg_c]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   "movdqu %[reg_c], 32(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b),
- [reg_c] "=x" (reg_c)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
+   ymm0 = _mm256_loadu_si256((const __m256i *)src);
+   _mm256_storeu_si256((__m256i *)dst, ymm0);
 }

+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a, reg_b, reg_c, reg_d;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu 32(%[src]), %[reg_c]\n\t"
-   "movdqu 48(%[s

  1   2   >