[dpdk-dev] [PATCH v2 19/19] vhost: make buf vector for scatter Rx local

2016-05-12 Thread Yuanhan Liu
From: Ilya Maximets 

Array of buf_vector's is just an array for temporary storing information
about available descriptors. It used only locally in virtio_dev_merge_rx()
and there is no reason for that array to be shared.

Fix that by allocating local buf_vec inside virtio_dev_merge_rx().

Signed-off-by: Ilya Maximets 
Signed-off-by: Yuanhan Liu 
---
 lib/librte_vhost/vhost-net.h  |  1 -
 lib/librte_vhost/vhost_rxtx.c | 41 ++---
 2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 590a039..162ad04 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -87,7 +87,6 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_tlog_guest_addr;
-   struct buf_vector   buf_vec[BUF_VECTOR_MAX];
 } __rte_cache_aligned;

 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index c9cd1c5..96720db 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -335,7 +335,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,

 static inline int
 fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-uint32_t *allocated, uint32_t *vec_idx)
+uint32_t *allocated, uint32_t *vec_idx,
+struct buf_vector *buf_vec)
 {
uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
uint32_t vec_id = *vec_idx;
@@ -346,9 +347,9 @@ fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
return -1;

len += vq->desc[idx].len;
-   vq->buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-   vq->buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-   vq->buf_vec[vec_id].desc_idx = idx;
+   buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
+   buf_vec[vec_id].buf_len  = vq->desc[idx].len;
+   buf_vec[vec_id].desc_idx = idx;
vec_id++;

if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
@@ -371,7 +372,8 @@ fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
  */
 static inline int
 reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-   uint16_t *start, uint16_t *end)
+   uint16_t *start, uint16_t *end,
+   struct buf_vector *buf_vec)
 {
uint16_t res_start_idx;
uint16_t res_cur_idx;
@@ -393,7 +395,7 @@ again:
return -1;

if (unlikely(fill_vec_buf(vq, res_cur_idx, ,
- _idx) < 0))
+ _idx, buf_vec) < 0))
return -1;

res_cur_idx++;
@@ -427,7 +429,7 @@ again:
 static inline uint32_t __attribute__((always_inline))
 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t res_start_idx, uint16_t res_end_idx,
-   struct rte_mbuf *m)
+   struct rte_mbuf *m, struct buf_vector *buf_vec)
 {
struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
uint32_t vec_idx = 0;
@@ -444,10 +446,10 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
dev->vid, cur_idx, res_end_idx);

-   if (vq->buf_vec[vec_idx].buf_len < dev->vhost_hlen)
+   if (buf_vec[vec_idx].buf_len < dev->vhost_hlen)
return -1;

-   desc_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
+   desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
rte_prefetch0((void *)(uintptr_t)desc_addr);

virtio_hdr.num_buffers = res_end_idx - res_start_idx;
@@ -456,10 +458,10 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,

virtio_enqueue_offload(m, _hdr.hdr);
copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-   vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
+   vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);

-   desc_avail  = vq->buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+   desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
desc_offset = dev->vhost_hlen;

mbuf_avail  = rte_pktmbuf_data_len(m);
@@ -467,7 +469,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
while (mbuf_avail != 0 || m->next != NULL) {
/* done with current desc buf, get the next one */
if (desc_avail == 0) {
-   desc_idx = vq->buf_vec[vec_idx].desc_idx;
+

[dpdk-dev] [PATCH v2 18/19] vhost: per device virtio net header len

2016-05-12 Thread Yuanhan Liu
Virtio net header length is set per device, but not per queue. So, there
is no reason to store it in vhost_virtqueue struct, instead, we should
store it in virtio_net struct, to make one copy only.

Signed-off-by: Yuanhan Liu 
---
 lib/librte_vhost/vhost-net.h  |  2 +-
 lib/librte_vhost/vhost_rxtx.c | 40 
 lib/librte_vhost/virtio-net.c | 13 ++---
 3 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index dbd2d62..590a039 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -69,7 +69,6 @@ struct vhost_virtqueue {
struct vring_avail  *avail;
struct vring_used   *used;
uint32_tsize;
-   uint16_tvhost_hlen;

/* Last index used on the available ring */
volatile uint16_t   last_used_idx;
@@ -129,6 +128,7 @@ struct virtio_net {
uint64_tprotocol_features;
int vid;
uint32_tflags;
+   uint16_tvhost_hlen;
 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
charifname[IF_NAME_SZ];
uint32_tvirt_qp_nb;
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 65278bb..c9cd1c5 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -126,10 +126,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
 }

 static inline void
-copy_virtio_net_hdr(struct vhost_virtqueue *vq, uint64_t desc_addr,
+copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
struct virtio_net_hdr_mrg_rxbuf hdr)
 {
-   if (vq->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
+   if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
else
*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
@@ -147,19 +147,19 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};

desc = >desc[desc_idx];
-   if (unlikely(desc->len < vq->vhost_hlen))
+   if (unlikely(desc->len < dev->vhost_hlen))
return -1;

desc_addr = gpa_to_vva(dev, desc->addr);
rte_prefetch0((void *)(uintptr_t)desc_addr);

virtio_enqueue_offload(m, _hdr.hdr);
-   copy_virtio_net_hdr(vq, desc_addr, virtio_hdr);
-   vhost_log_write(dev, desc->addr, vq->vhost_hlen);
-   PRINT_PACKET(dev, (uintptr_t)desc_addr, vq->vhost_hlen, 0);
+   copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+   vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+   PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);

-   desc_offset = vq->vhost_hlen;
-   desc_avail  = desc->len - vq->vhost_hlen;
+   desc_offset = dev->vhost_hlen;
+   desc_avail  = desc->len - dev->vhost_hlen;

*copied = rte_pktmbuf_pkt_len(m);
mbuf_avail  = rte_pktmbuf_data_len(m);
@@ -300,9 +300,9 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,

vq->used->ring[used_idx].id = desc_idx;
if (unlikely(err))
-   vq->used->ring[used_idx].len = vq->vhost_hlen;
+   vq->used->ring[used_idx].len = dev->vhost_hlen;
else
-   vq->used->ring[used_idx].len = copied + vq->vhost_hlen;
+   vq->used->ring[used_idx].len = copied + dev->vhost_hlen;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
@@ -444,7 +444,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
dev->vid, cur_idx, res_end_idx);

-   if (vq->buf_vec[vec_idx].buf_len < vq->vhost_hlen)
+   if (vq->buf_vec[vec_idx].buf_len < dev->vhost_hlen)
return -1;

desc_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
@@ -455,12 +455,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
dev->vid, virtio_hdr.num_buffers);

virtio_enqueue_offload(m, _hdr.hdr);
-   copy_virtio_net_hdr(vq, desc_addr, virtio_hdr);
-   vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr, vq->vhost_hlen);
-   PRINT_PACKET(dev, (uintptr_t)desc_addr, vq->vhost_hlen, 0);
+   copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+   vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
+   PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);

-   desc_avail  = vq->buf_vec[vec_idx].buf_len - 

[dpdk-dev] [PATCH v2 17/19] vhost: reserve few more space for future extension

2016-05-12 Thread Yuanhan Liu
"virtio_net_device_ops" is the only left open struct that an application
can access, therefore, it's the only place that might introduce potential
ABI break in future for extension.

So, do some reservation for it. 5 should be pretty enough, considering
that we have barely touched it for a long while. Another reason to
choose 5 is for cache alignment: 5 makes the struct 64 bytes for 64 bit
machine.

With this, it's confidence to say that we might be able to be free from
the ABI violation forever.

Signed-off-by: Yuanhan Liu 
---
 lib/librte_vhost/rte_virtio_net.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/librte_vhost/rte_virtio_net.h 
b/lib/librte_vhost/rte_virtio_net.h
index fc1d799..bc2b74b 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -66,6 +66,8 @@ struct virtio_net_device_ops {
void (*destroy_device)(int vid);/**< Remove device. */

int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); 
/**< triggered when a vring is enabled or disabled */
+
+   void *reserved[5]; /**< Reserved for future extension */
 };

 /**
-- 
1.9.0



[dpdk-dev] [PATCH v2 16/19] vhost: remove virtio-net.h

2016-05-12 Thread Yuanhan Liu
It barely has anything useful there, just 2 functions prototype. Here
move them to vhost-net.h, and delete it.

Signed-off-by: Yuanhan Liu 
---
 lib/librte_vhost/vhost-net.h  |  3 ++
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |  1 -
 lib/librte_vhost/vhost_rxtx.c |  1 -
 lib/librte_vhost/vhost_user/virtio-net-user.c |  1 -
 lib/librte_vhost/virtio-net.c |  1 -
 lib/librte_vhost/virtio-net.h | 43 ---
 6 files changed, 3 insertions(+), 47 deletions(-)
 delete mode 100644 lib/librte_vhost/virtio-net.h

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index c133ea8..dbd2d62 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -220,6 +220,9 @@ gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
return vhost_va;
 }

+struct virtio_net_device_ops const *notify_ops;
+struct virtio_net *get_device(int vid);
+
 int vhost_new_device(void);
 void vhost_destroy_device(int);

diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c 
b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
index 0723a7a..552be7d 100644
--- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
+++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
@@ -54,7 +54,6 @@
 #include "rte_virtio_net.h"
 #include "vhost-net.h"
 #include "virtio-net-cdev.h"
-#include "virtio-net.h"
 #include "eventfd_copy.h"

 /* Line size for reading maps file. */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08cab08..65278bb 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -46,7 +46,6 @@
 #include 

 #include "vhost-net.h"
-#include "virtio-net.h"

 #define MAX_PKT_BURST 32
 #define VHOST_LOG_PAGE 4096
diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c 
b/lib/librte_vhost/vhost_user/virtio-net-user.c
index 7fa69a7..6463bdd 100644
--- a/lib/librte_vhost/vhost_user/virtio-net-user.c
+++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
@@ -43,7 +43,6 @@
 #include 
 #include 

-#include "virtio-net.h"
 #include "virtio-net-user.h"
 #include "vhost-net-user.h"
 #include "vhost-net.h"
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index ea216c0..13dc021 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -53,7 +53,6 @@
 #include 

 #include "vhost-net.h"
-#include "virtio-net.h"

 #define MAX_VHOST_DEVICE   1024
 static struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
diff --git a/lib/librte_vhost/virtio-net.h b/lib/librte_vhost/virtio-net.h
deleted file mode 100644
index 9812545..000
--- a/lib/librte_vhost/virtio-net.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in
- *   the documentation and/or other materials provided with the
- *   distribution.
- * * Neither the name of Intel Corporation nor the names of its
- *   contributors may be used to endorse or promote products derived
- *   from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VIRTIO_NET_H
-#define _VIRTIO_NET_H
-
-#include "vhost-net.h"
-#include "rte_virtio_net.h"
-
-struct virtio_net_device_ops const *notify_ops;
-struct virtio_net *get_device(int vid);
-
-#endif
-- 
1.9.0



[dpdk-dev] [PATCH v2 15/19] vhost: remove unnecessary fields

2016-05-12 Thread Yuanhan Liu
The "reserved" field in virtio_net and vhost_virtqueue struct is not
necessary any more. We now expose virtio_net device with a number "vid".

This patch also removes the "priv" field: all fields are priviate now:
application can't access it now. The only way that we could still access
it is to expose it by a function, but I doubt that's needed or worthwhile.

Signed-off-by: Yuanhan Liu 
---
 lib/librte_vhost/vhost-net.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 3edeb92..c133ea8 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -88,7 +88,6 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_tlog_guest_addr;
-   uint64_treserved[15];
struct buf_vector   buf_vec[BUF_VECTOR_MAX];
 } __rte_cache_aligned;

@@ -133,14 +132,12 @@ struct virtio_net {
 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
charifname[IF_NAME_SZ];
uint32_tvirt_qp_nb;
-   void*priv;
uint64_tlog_size;
uint64_tlog_base;
struct ether_addr   mac;

/* to tell if we need broadcast rarp packet */
rte_atomic16_t  broadcast_rarp;
-   uint64_treserved[61];
struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
 } __rte_cache_aligned;

-- 
1.9.0



[dpdk-dev] [PATCH v2 14/19] vhost: hide internal structs/macros/functions

2016-05-12 Thread Yuanhan Liu
We are now safe to move all those internal structs/macros/functions to
vhost-net.h, to hide them from external access.

This patch also breaks long lines and removes some redundant comments.

Signed-off-by: Yuanhan Liu 
---
 lib/librte_vhost/rte_virtio_net.h| 139 -
 lib/librte_vhost/vhost-net.h | 148 +++
 lib/librte_vhost/vhost_user/vhost-net-user.h |   2 +
 3 files changed, 150 insertions(+), 139 deletions(-)

diff --git a/lib/librte_vhost/rte_virtio_net.h 
b/lib/librte_vhost/rte_virtio_net.h
index 370345e..fc1d799 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -51,125 +51,9 @@
 #include 
 #include 

-struct rte_mbuf;
-
-#define VHOST_MEMORY_MAX_NREGIONS 8
-
-/* Used to indicate that the device is running on a data core */
-#define VIRTIO_DEV_RUNNING 1
-
-/* Backend value set by guest. */
-#define VIRTIO_DEV_STOPPED -1
-
-
 /* Enum for virtqueue management. */
 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};

-#define BUF_VECTOR_MAX 256
-
-/**
- * Structure contains buffer address, length and descriptor index
- * from vring to do scatter RX.
- */
-struct buf_vector {
-   uint64_t buf_addr;
-   uint32_t buf_len;
-   uint32_t desc_idx;
-};
-
-/**
- * Structure contains variables relevant to RX/TX virtqueues.
- */
-struct vhost_virtqueue {
-   struct vring_desc   *desc;  /**< Virtqueue 
descriptor ring. */
-   struct vring_avail  *avail; /**< Virtqueue 
available ring. */
-   struct vring_used   *used;  /**< Virtqueue used 
ring. */
-   uint32_tsize;   /**< Size of descriptor 
ring. */
-   int backend;/**< Backend value to 
determine if device should started/stopped. */
-   uint16_tvhost_hlen; /**< Vhost header 
length (varies depending on RX merge buffers. */
-   volatile uint16_t   last_used_idx;  /**< Last index used on 
the available ring */
-   volatile uint16_t   last_used_idx_res;  /**< Used for multiple 
devices reserving buffers. */
-#define VIRTIO_INVALID_EVENTFD (-1)
-#define VIRTIO_UNINITIALIZED_EVENTFD   (-2)
-   int callfd; /**< Used to notify the 
guest (trigger interrupt). */
-   int kickfd; /**< Currently unused 
as polling mode is enabled. */
-   int enabled;
-   uint64_tlog_guest_addr; /**< Physical address 
of used ring, for logging */
-   uint64_treserved[15];   /**< Reserve some 
spaces for future extension. */
-   struct buf_vector   buf_vec[BUF_VECTOR_MAX];/**< for 
scatter RX. */
-} __rte_cache_aligned;
-
-/* Old kernels have no such macro defined */
-#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
- #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
-#endif
-
-
-/*
- * Make an extra wrapper for VIRTIO_NET_F_MQ and
- * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are
- * introduced since kernel v3.8. This makes our
- * code buildable for older kernel.
- */
-#ifdef VIRTIO_NET_F_MQ
- #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX
- #define VHOST_SUPPORTS_MQ (1ULL << VIRTIO_NET_F_MQ)
-#else
- #define VHOST_MAX_QUEUE_PAIRS 1
- #define VHOST_SUPPORTS_MQ 0
-#endif
-
-/*
- * Define virtio 1.0 for older kernels
- */
-#ifndef VIRTIO_F_VERSION_1
- #define VIRTIO_F_VERSION_1 32
-#endif
-
-/**
- * Device structure contains all configuration information relating to the 
device.
- */
-struct virtio_net {
-   struct virtio_memory*mem;   /**< QEMU memory and memory 
region information. */
-   uint64_tfeatures;   /**< Negotiated feature set. */
-   uint64_tprotocol_features;  /**< Negotiated 
protocol feature set. */
-   int vid;/**< device identifier. */
-   uint32_tflags;  /**< Device flags. Only used to 
check if device is running on data core. */
-#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
-   charifname[IF_NAME_SZ]; /**< Name of the tap 
device or socket path. */
-   uint32_tvirt_qp_nb; /**< number of queue pair we 
have allocated */
-   void*priv;  /**< private context */
-   uint64_tlog_size;   /**< Size of log area */
-   uint64_tlog_base;   /**< Where dirty pages are 
logged */
-   struct ether_addr   mac;/**< MAC address */
-   rte_atomic16_t  broadcast_rarp; /**< A flag to tell if we need 
broadcast rarp packet */
-   uint64_treserved[61];   /**< Reserve some spaces for 
future extension. */
-   struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 

[dpdk-dev] [PATCH v2 13/19] vhost: export vid as the only interface to applications

2016-05-12 Thread Yuanhan Liu
With all the previous prepare works, we are just one step away from
the final ABI refactoring. That is, to change current API to let them
stick to vid instead of the old virtio_net dev.

Signed-off-by: Yuanhan Liu 
---

v2: update release note
---
 doc/guides/rel_notes/release_16_07.rst|  7 
 drivers/net/vhost/rte_eth_vhost.c | 47 +--
 examples/vhost/main.c | 25 +++---
 lib/librte_vhost/rte_virtio_net.h | 18 +-
 lib/librte_vhost/vhost_rxtx.c | 15 +++--
 lib/librte_vhost/vhost_user/virtio-net-user.c | 14 
 lib/librte_vhost/virtio-net.c | 17 ++
 7 files changed, 75 insertions(+), 68 deletions(-)

diff --git a/doc/guides/rel_notes/release_16_07.rst 
b/doc/guides/rel_notes/release_16_07.rst
index d293eda..ebc507b 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -99,6 +99,10 @@ This section should contain API changes. Sample format:

 * ``rte_vring_available_entries`` is renamed to ``rte_vhost_avail_entries``.

+* All existing vhost APIs and callbacks with ``virtio_net`` struct pointer
+  as the parameter have been changed due to the ABI refactoring mentioned
+  below: it's replaced by "int vid".
+

 ABI Changes
 ---
@@ -110,6 +114,9 @@ ABI Changes
 * The ``rte_port_source_params`` structure has new fields to support PCAP file.
   It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.

+* vhost ABI refactoring has been made: ``virtio_net`` structure is never
+  exported to application any more. Instead, a handle, ``vid``, has been
+  used to represent this structure internally.

 Shared Library Versions
 ---
diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index de0f25e..56c1c36 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -71,9 +71,9 @@ static struct ether_addr base_eth_addr = {
 };

 struct vhost_queue {
+   int vid;
rte_atomic32_t allow_queuing;
rte_atomic32_t while_queuing;
-   struct virtio_net *device;
struct pmd_internal *internal;
struct rte_mempool *mb_pool;
uint8_t port;
@@ -139,7 +139,7 @@ eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t 
nb_bufs)
goto out;

/* Dequeue packets from guest TX queue */
-   nb_rx = rte_vhost_dequeue_burst(r->device,
+   nb_rx = rte_vhost_dequeue_burst(r->vid,
r->virtqueue_id, r->mb_pool, bufs, nb_bufs);

r->rx_pkts += nb_rx;
@@ -170,7 +170,7 @@ eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t 
nb_bufs)
goto out;

/* Enqueue packets to guest RX queue */
-   nb_tx = rte_vhost_enqueue_burst(r->device,
+   nb_tx = rte_vhost_enqueue_burst(r->vid,
r->virtqueue_id, bufs, nb_bufs);

r->tx_pkts += nb_tx;
@@ -222,7 +222,7 @@ find_internal_resource(char *ifname)
 }

 static int
-new_device(struct virtio_net *dev)
+new_device(int vid)
 {
struct rte_eth_dev *eth_dev;
struct internal_list *list;
@@ -234,12 +234,7 @@ new_device(struct virtio_net *dev)
int newnode;
 #endif

-   if (dev == NULL) {
-   RTE_LOG(INFO, PMD, "Invalid argument\n");
-   return -1;
-   }
-
-   rte_vhost_get_ifname(dev->vid, ifname, sizeof(ifname));
+   rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
list = find_internal_resource(ifname);
if (list == NULL) {
RTE_LOG(INFO, PMD, "Invalid device name: %s\n", ifname);
@@ -250,7 +245,7 @@ new_device(struct virtio_net *dev)
internal = eth_dev->data->dev_private;

 #ifdef RTE_LIBRTE_VHOST_NUMA
-   newnode = rte_vhost_get_numa_node(dev->vid);
+   newnode = rte_vhost_get_numa_node(vid);
if (newnode > 0)
eth_dev->data->numa_node = newnode;
 #endif
@@ -259,7 +254,7 @@ new_device(struct virtio_net *dev)
vq = eth_dev->data->rx_queues[i];
if (vq == NULL)
continue;
-   vq->device = dev;
+   vq->vid = vid;
vq->internal = internal;
vq->port = eth_dev->data->port_id;
}
@@ -267,13 +262,13 @@ new_device(struct virtio_net *dev)
vq = eth_dev->data->tx_queues[i];
if (vq == NULL)
continue;
-   vq->device = dev;
+   vq->vid = vid;
vq->internal = internal;
vq->port = eth_dev->data->port_id;
}

-   for (i = 0; i < rte_vhost_get_queue_num(dev->vid) * VIRTIO_QNUM; i++)
-   rte_vhost_enable_guest_notification(dev, i, 0);
+   for (i = 0; i < rte_vhost_get_queue_num(vid) * VIRTIO_QNUM; i++)
+   rte_vhost_enable_guest_notification(vid, i, 0);

eth_dev->data->dev_link.link_status = 

[dpdk-dev] [PATCH v2 12/19] vhost: remove dependency on priv field

2016-05-12 Thread Yuanhan Liu
This change could let us avoid the dependency of "virtio_net"
struct, to prepare for the ABI refactoring.

Signed-off-by: Yuanhan Liu 
---
 drivers/net/vhost/rte_eth_vhost.c | 13 +++--
 examples/vhost/main.c | 18 --
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 6fa9f6b..de0f25e 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -275,7 +275,6 @@ new_device(struct virtio_net *dev)
for (i = 0; i < rte_vhost_get_queue_num(dev->vid) * VIRTIO_QNUM; i++)
rte_vhost_enable_guest_notification(dev, i, 0);

-   dev->priv = eth_dev;
eth_dev->data->dev_link.link_status = ETH_LINK_UP;

for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
@@ -303,6 +302,8 @@ destroy_device(volatile struct virtio_net *dev)
 {
struct rte_eth_dev *eth_dev;
struct vhost_queue *vq;
+   struct internal_list *list;
+   char ifname[PATH_MAX];
unsigned i;

if (dev == NULL) {
@@ -310,11 +311,13 @@ destroy_device(volatile struct virtio_net *dev)
return;
}

-   eth_dev = (struct rte_eth_dev *)dev->priv;
-   if (eth_dev == NULL) {
-   RTE_LOG(INFO, PMD, "Failed to find a ethdev\n");
+   rte_vhost_get_ifname(dev->vid, ifname, sizeof(ifname));
+   list = find_internal_resource(ifname);
+   if (list == NULL) {
+   RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
return;
}
+   eth_dev = list->eth_dev;

/* Wait until rx/tx_pkt_burst stops accessing vhost device */
for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
@@ -336,8 +339,6 @@ destroy_device(volatile struct virtio_net *dev)

eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;

-   dev->priv = NULL;
-
for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
vq = eth_dev->data->rx_queues[i];
if (vq == NULL)
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 67ef0ad..c408577 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -700,6 +700,19 @@ find_vhost_dev(struct ether_addr *mac)
return NULL;
 }

+static inline struct vhost_dev *__attribute__((always_inline))
+find_vhost_dev_by_vid(int vid)
+{
+   struct vhost_dev *vdev;
+
+   TAILQ_FOREACH(vdev, _dev_list, next) {
+   if (vdev->ready == DEVICE_RX && vdev->vid == vid)
+   return vdev;
+   }
+
+   return NULL;
+}
+
 /*
  * This function learns the MAC address of the device and registers this along 
with a
  * vlan tag to a VMDQ.
@@ -1175,7 +1188,9 @@ destroy_device (volatile struct virtio_net *dev)
struct vhost_dev *vdev;
int lcore;

-   vdev = (struct vhost_dev *)dev->priv;
+   vdev = find_vhost_dev_by_vid(dev->vid);
+   if (!vdev)
+   return;
/*set the remove flag. */
vdev->remove = 1;
while(vdev->ready != DEVICE_SAFE_REMOVE) {
@@ -1228,7 +1243,6 @@ new_device (struct virtio_net *dev)
return -1;
}
vdev->dev = dev;
-   dev->priv = vdev;
vdev->vid = vid;

TAILQ_INSERT_TAIL(_dev_list, vdev, next);
-- 
1.9.0



[dpdk-dev] [PATCH v2 11/19] vhost: introduce new API to export queue free entries

2016-05-12 Thread Yuanhan Liu
The new API rte_vhost_avail_entries() is actually a rename of
rte_vring_available_entries(), with the "vring" to "vhost" name
change to keep the consistency of other vhost exported APIs.

This change could let us avoid the dependency of "virtio_net"
struct, to prepare for the ABI refactoring.

Signed-off-by: Yuanhan Liu 
---
 doc/guides/rel_notes/release_16_07.rst |  2 ++
 examples/vhost/main.c  |  4 ++--
 lib/librte_vhost/rte_vhost_version.map |  1 +
 lib/librte_vhost/rte_virtio_net.h  | 24 +---
 lib/librte_vhost/virtio-net.c  | 17 +
 5 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/doc/guides/rel_notes/release_16_07.rst 
b/doc/guides/rel_notes/release_16_07.rst
index f6d543c..d293eda 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -97,6 +97,8 @@ This section should contain API changes. Sample format:
   ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
   tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.

+* ``rte_vring_available_entries`` is renamed to ``rte_vhost_avail_entries``.
+

 ABI Changes
 ---
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index cb04585..67ef0ad 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1055,13 +1055,13 @@ drain_eth_rx(struct vhost_dev *vdev)
 * to diminish packet loss.
 */
if (enable_retry &&
-   unlikely(rx_count > rte_vring_available_entries(dev,
+   unlikely(rx_count > rte_vhost_avail_entries(dev->vid,
VIRTIO_RXQ))) {
uint32_t retry;

for (retry = 0; retry < burst_rx_retry_num; retry++) {
rte_delay_us(burst_rx_delay_time);
-   if (rx_count <= rte_vring_available_entries(dev,
+   if (rx_count <= rte_vhost_avail_entries(dev->vid,
VIRTIO_RXQ))
break;
}
diff --git a/lib/librte_vhost/rte_vhost_version.map 
b/lib/librte_vhost/rte_vhost_version.map
index 4608e3b..93f1188 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -24,6 +24,7 @@ DPDK_2.1 {
 DPDK_16.07 {
global:

+   rte_vhost_avail_entries;
rte_vhost_get_ifname;
rte_vhost_get_numa_node;
rte_vhost_get_queue_num;
diff --git a/lib/librte_vhost/rte_virtio_net.h 
b/lib/librte_vhost/rte_virtio_net.h
index 0898e8b..0427461 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -184,17 +184,6 @@ struct virtio_net_device_ops {
int (*vring_state_changed)(struct virtio_net *dev, uint16_t queue_id, 
int enable);  /**< triggered when a vring is enabled or disabled */
 };

-static inline uint16_t __attribute__((always_inline))
-rte_vring_available_entries(struct virtio_net *dev, uint16_t queue_id)
-{
-   struct vhost_virtqueue *vq = dev->virtqueue[queue_id];
-
-   if (!vq->enabled)
-   return 0;
-
-   return *(volatile uint16_t *)>avail->idx - vq->last_used_idx_res;
-}
-
 /**
  * Function to convert guest physical addresses to vhost virtual addresses.
  * This is used to convert guest virtio buffer addresses.
@@ -285,6 +274,19 @@ uint32_t rte_vhost_get_queue_num(int vid);
 int rte_vhost_get_ifname(int vid, char *buf, size_t len);

 /**
+ * Get how many avail entries are left in the queue
+ *
+ * @param vid
+ *  virtio-net device ID
+ * @param queue_id
+ *  virtio queue index
+ *
+ * @return
+ *  num of avail entires left
+ */
+uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
+
+/**
  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
  * be received from the physical port or from another virtual device. A packet
  * count is returned to indicate the number of packets that were succesfully
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 375c9d4..115eba4 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -783,6 +783,23 @@ rte_vhost_get_ifname(int vid, char *buf, size_t len)
return 0;
 }

+uint16_t
+rte_vhost_avail_entries(int vid, uint16_t queue_id)
+{
+   struct virtio_net *dev;
+   struct vhost_virtqueue *vq;
+
+   dev = get_device(vid);
+   if (!dev)
+   return 0;
+
+   vq = dev->virtqueue[queue_id];
+   if (!vq->enabled)
+   return 0;
+
+   return *(volatile uint16_t *)>avail->idx - vq->last_used_idx_res;
+}
+
 int rte_vhost_enable_guest_notification(struct virtio_net *dev,
uint16_t queue_id, int enable)
 {
-- 
1.9.0



[dpdk-dev] [PATCH v2 10/19] vhost: introduce new API to export ifname

2016-05-12 Thread Yuanhan Liu
Introduce a new API rte_vhost_get_ifname() to export the ifname to
application.

Signed-off-by: Yuanhan Liu 
---
 drivers/net/vhost/rte_eth_vhost.c  | 12 
 lib/librte_vhost/rte_vhost_version.map |  1 +
 lib/librte_vhost/rte_virtio_net.h  | 17 +
 lib/librte_vhost/virtio-net.c  | 16 
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index fe0ce90..6fa9f6b 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -229,6 +229,7 @@ new_device(struct virtio_net *dev)
struct pmd_internal *internal;
struct vhost_queue *vq;
unsigned i;
+   char ifname[PATH_MAX];
 #ifdef RTE_LIBRTE_VHOST_NUMA
int newnode;
 #endif
@@ -238,9 +239,10 @@ new_device(struct virtio_net *dev)
return -1;
}

-   list = find_internal_resource(dev->ifname);
+   rte_vhost_get_ifname(dev->vid, ifname, sizeof(ifname));
+   list = find_internal_resource(ifname);
if (list == NULL) {
-   RTE_LOG(INFO, PMD, "Invalid device name\n");
+   RTE_LOG(INFO, PMD, "Invalid device name: %s\n", ifname);
return -1;
}

@@ -360,15 +362,17 @@ vring_state_changed(struct virtio_net *dev, uint16_t 
vring, int enable)
struct rte_vhost_vring_state *state;
struct rte_eth_dev *eth_dev;
struct internal_list *list;
+   char ifname[PATH_MAX];

if (dev == NULL) {
RTE_LOG(ERR, PMD, "Invalid argument\n");
return -1;
}

-   list = find_internal_resource(dev->ifname);
+   rte_vhost_get_ifname(dev->vid, ifname, sizeof(ifname));
+   list = find_internal_resource(ifname);
if (list == NULL) {
-   RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", dev->ifname);
+   RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
return -1;
}

diff --git a/lib/librte_vhost/rte_vhost_version.map 
b/lib/librte_vhost/rte_vhost_version.map
index a65fa21..4608e3b 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -24,6 +24,7 @@ DPDK_2.1 {
 DPDK_16.07 {
global:

+   rte_vhost_get_ifname;
rte_vhost_get_numa_node;
rte_vhost_get_queue_num;

diff --git a/lib/librte_vhost/rte_virtio_net.h 
b/lib/librte_vhost/rte_virtio_net.h
index de56b1b..0898e8b 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -268,6 +268,23 @@ int rte_vhost_get_numa_node(int vid);
 uint32_t rte_vhost_get_queue_num(int vid);

 /**
+ * Get the virtio net device's ifname. For vhost-cuse, ifname is the
+ * path of the char device. For vhost-user, ifname is the vhost-user
+ * socket file path.
+ *
+ * @param vid
+ *  virtio-net device ID
+ * @param buf
+ *  The buffer to stored the queried ifname
+ * @param len
+ *  The length of buf
+ *
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_get_ifname(int vid, char *buf, size_t len);
+
+/**
  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
  * be received from the physical port or from another virtual device. A packet
  * count is returned to indicate the number of packets that were succesfully
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index a03ff30..375c9d4 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -767,6 +767,22 @@ rte_vhost_get_queue_num(int vid)
return dev->virt_qp_nb;
 }

+int
+rte_vhost_get_ifname(int vid, char *buf, size_t len)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (dev == NULL)
+   return -1;
+
+   len = RTE_MIN(len, sizeof(dev->ifname));
+
+   strncpy(buf, dev->ifname, len);
+   buf[len - 1] = '\0';
+
+   return 0;
+}
+
 int rte_vhost_enable_guest_notification(struct virtio_net *dev,
uint16_t queue_id, int enable)
 {
-- 
1.9.0



[dpdk-dev] [PATCH v2 09/19] vhost: introduce new API to export number of queues

2016-05-12 Thread Yuanhan Liu
Introduce a new API rte_vhost_get_queue_num() to export the number of
queues.

Signed-off-by: Yuanhan Liu 
---
 drivers/net/vhost/rte_eth_vhost.c  |  2 +-
 lib/librte_vhost/rte_vhost_version.map |  1 +
 lib/librte_vhost/rte_virtio_net.h  | 11 +++
 lib/librte_vhost/virtio-net.c  | 11 +++
 4 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 204abff..fe0ce90 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -270,7 +270,7 @@ new_device(struct virtio_net *dev)
vq->port = eth_dev->data->port_id;
}

-   for (i = 0; i < dev->virt_qp_nb * VIRTIO_QNUM; i++)
+   for (i = 0; i < rte_vhost_get_queue_num(dev->vid) * VIRTIO_QNUM; i++)
rte_vhost_enable_guest_notification(dev, i, 0);

dev->priv = eth_dev;
diff --git a/lib/librte_vhost/rte_vhost_version.map 
b/lib/librte_vhost/rte_vhost_version.map
index bf7b000..a65fa21 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -25,5 +25,6 @@ DPDK_16.07 {
global:

rte_vhost_get_numa_node;
+   rte_vhost_get_queue_num;

 } DPDK_16.04;
diff --git a/lib/librte_vhost/rte_virtio_net.h 
b/lib/librte_vhost/rte_virtio_net.h
index b8e9b02..de56b1b 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -257,6 +257,17 @@ int rte_vhost_driver_session_start(void);
 int rte_vhost_get_numa_node(int vid);

 /**
+ * Get the number of queues the device supports.
+ *
+ * @param vid
+ *  virtio-net device ID
+ *
+ * @return
+ *  The number of queues, 0 on failure
+ */
+uint32_t rte_vhost_get_queue_num(int vid);
+
+/**
  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
  * be received from the physical port or from another virtual device. A packet
  * count is returned to indicate the number of packets that were succesfully
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 25b6515..a03ff30 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -756,6 +756,17 @@ rte_vhost_get_numa_node(int vid)
 #endif
 }

+uint32_t
+rte_vhost_get_queue_num(int vid)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (dev == NULL)
+   return 0;
+
+   return dev->virt_qp_nb;
+}
+
 int rte_vhost_enable_guest_notification(struct virtio_net *dev,
uint16_t queue_id, int enable)
 {
-- 
1.9.0



[dpdk-dev] [PATCH v2 08/19] vhost: introduce new API to export numa node

2016-05-12 Thread Yuanhan Liu
Introduce a new API rte_vhost_get_numa_node() to get the numa node
from which the virtio_net struct is allocated.

Signed-off-by: Yuanhan Liu 
---
 drivers/net/vhost/rte_eth_vhost.c  | 13 -
 lib/librte_vhost/rte_vhost_version.map |  7 +++
 lib/librte_vhost/rte_virtio_net.h  | 12 
 lib/librte_vhost/virtio-net.c  | 26 ++
 4 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 63538c1..204abff 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -230,7 +230,7 @@ new_device(struct virtio_net *dev)
struct vhost_queue *vq;
unsigned i;
 #ifdef RTE_LIBRTE_VHOST_NUMA
-   int newnode, ret;
+   int newnode;
 #endif

if (dev == NULL) {
@@ -248,14 +248,9 @@ new_device(struct virtio_net *dev)
internal = eth_dev->data->dev_private;

 #ifdef RTE_LIBRTE_VHOST_NUMA
-   ret  = get_mempolicy(, NULL, 0, dev,
-   MPOL_F_NODE | MPOL_F_ADDR);
-   if (ret < 0) {
-   RTE_LOG(ERR, PMD, "Unknown numa node\n");
-   return -1;
-   }
-
-   eth_dev->data->numa_node = newnode;
+   newnode = rte_vhost_get_numa_node(dev->vid);
+   if (newnode > 0)
+   eth_dev->data->numa_node = newnode;
 #endif

for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
diff --git a/lib/librte_vhost/rte_vhost_version.map 
b/lib/librte_vhost/rte_vhost_version.map
index 3d8709e..bf7b000 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -20,3 +20,10 @@ DPDK_2.1 {
rte_vhost_driver_unregister;

 } DPDK_2.0;
+
+DPDK_16.07 {
+   global:
+
+   rte_vhost_get_numa_node;
+
+} DPDK_16.04;
diff --git a/lib/librte_vhost/rte_virtio_net.h 
b/lib/librte_vhost/rte_virtio_net.h
index bc64e89..b8e9b02 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -245,6 +245,18 @@ int rte_vhost_driver_callback_register(struct 
virtio_net_device_ops const * cons
 int rte_vhost_driver_session_start(void);

 /**
+ * Get the numa node from which the virtio net device's memory
+ * is allocated.
+ *
+ * @param vid
+ *  virtio-net device ID
+ *
+ * @return
+ *  The numa node, -1 on failure
+ */
+int rte_vhost_get_numa_node(int vid);
+
+/**
  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
  * be received from the physical port or from another virtual device. A packet
  * count is returned to indicate the number of packets that were succesfully
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index c6d3829..25b6515 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -730,6 +730,32 @@ vhost_set_backend(int vid, struct vhost_vring_file *file)
return 0;
 }

+int
+rte_vhost_get_numa_node(int vid)
+{
+#ifdef RTE_LIBRTE_VHOST_NUMA
+   struct virtio_net *dev = get_device(vid);
+   int numa_node;
+   int ret;
+
+   if (dev == NULL)
+   return -1;
+
+   ret = get_mempolicy(_node, NULL, 0, dev,
+   MPOL_F_NODE | MPOL_F_ADDR);
+   if (ret < 0) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "(%d) failed to query numa node: %d\n", vid, ret);
+   return -1;
+   }
+
+   return numa_node;
+#else
+   RTE_SET_USED(vid);
+   return -1;
+#endif
+}
+
 int rte_vhost_enable_guest_notification(struct virtio_net *dev,
uint16_t queue_id, int enable)
 {
-- 
1.9.0



[dpdk-dev] [PATCH v2 07/19] vhost: move vhost device ctx to cuse

2016-05-12 Thread Yuanhan Liu
vhost cuse is now the last reference of the vhost_device_ctx struct;
move it there, and do a rename to "vhost_cuse_device_ctx", to make
it clear that it's "cuse only".

Signed-off-by: Yuanhan Liu 
---
 lib/librte_vhost/vhost-net.h  |  8 
 lib/librte_vhost/vhost_cuse/vhost-net-cdev.c  | 13 +++--
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |  6 --
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.h | 12 ++--
 4 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 4de3aa0..4ed5816 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -75,14 +75,6 @@
 #endif


-/*
- * Structure used to identify device context.
- */
-struct vhost_device_ctx {
-   pid_t   pid;/* PID of process calling the IOCTL. */
-   int vid;/* Virtio-net device ID */
-};
-
 int vhost_new_device(void);
 void vhost_destroy_device(int);

diff --git a/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c 
b/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c
index 45a9a91..cf6d191 100644
--- a/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c
+++ b/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c
@@ -60,13 +60,14 @@ static const char default_cdev[] = "vhost-net";
 static struct fuse_session *session;

 /*
- * Returns vhost_device_ctx from given fuse_req_t. The index is populated later
- * when the device is added to the device linked list.
+ * Returns vhost_cuse_device_ctx from given fuse_req_t. The
+ * index is populated later when the device is added to the
+ * device linked list.
  */
-static struct vhost_device_ctx
+static struct vhost_cuse_device_ctx
 fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi)
 {
-   struct vhost_device_ctx ctx;
+   struct vhost_cuse_device_ctx ctx;
struct fuse_ctx const *const req_ctx = fuse_req_ctx(req);

ctx.pid = req_ctx->pid;
@@ -104,7 +105,7 @@ static void
 vhost_net_release(fuse_req_t req, struct fuse_file_info *fi)
 {
int err = 0;
-   struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi);
+   struct vhost_cuse_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi);

vhost_destroy_device(ctx.vid);
RTE_LOG(INFO, VHOST_CONFIG, "(%d) device released\n", ctx.vid);
@@ -182,7 +183,7 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg,
struct fuse_file_info *fi, __rte_unused unsigned flags,
const void *in_buf, size_t in_bufsz, size_t out_bufsz)
 {
-   struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi);
+   struct vhost_cuse_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi);
struct vhost_vring_file file;
struct vhost_vring_state state;
struct vhost_vring_addr addr;
diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c 
b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
index 34ee6c9..0723a7a 100644
--- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
+++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
@@ -263,7 +263,7 @@ host_memory_map(pid_t pid, uint64_t addr,
 }

 int
-cuse_set_mem_table(struct vhost_device_ctx ctx,
+cuse_set_mem_table(struct vhost_cuse_device_ctx ctx,
const struct vhost_memory *mem_regions_addr, uint32_t nregions)
 {
uint64_t size = offsetof(struct vhost_memory, regions);
@@ -405,7 +405,9 @@ get_ifname(int vid, int tap_fd, int pid)
return 0;
 }

-int cuse_set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file 
*file)
+int
+cuse_set_backend(struct vhost_cuse_device_ctx ctx,
+struct vhost_vring_file *file)
 {
struct virtio_net *dev;

diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h 
b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h
index eb6b0ba..3f67154 100644
--- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h
+++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h
@@ -38,11 +38,19 @@

 #include "vhost-net.h"

+/*
+ * Structure used to identify device context.
+ */
+struct vhost_cuse_device_ctx {
+   pid_t   pid;/* PID of process calling the IOCTL. */
+   int vid;/* Virtio-net device ID */
+};
+
 int
-cuse_set_mem_table(struct vhost_device_ctx ctx,
+cuse_set_mem_table(struct vhost_cuse_device_ctx ctx,
const struct vhost_memory *mem_regions_addr, uint32_t nregions);

 int
-cuse_set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *);
+cuse_set_backend(struct vhost_cuse_device_ctx ctx, struct vhost_vring_file *);

 #endif
-- 
1.9.0



[dpdk-dev] [PATCH v2 06/19] vhost: get device by vid only

2016-05-12 Thread Yuanhan Liu
get_device() just needs vid, so pass vid as the parameter only.

Signed-off-by: Yuanhan Liu 
---
 lib/librte_vhost/vhost-net.h  | 30 ++--
 lib/librte_vhost/vhost_cuse/vhost-net-cdev.c  | 64 -
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.c | 18 ---
 lib/librte_vhost/vhost_user/vhost-net-user.c  | 43 -
 lib/librte_vhost/vhost_user/virtio-net-user.c | 36 +++---
 lib/librte_vhost/vhost_user/virtio-net-user.h | 18 ---
 lib/librte_vhost/virtio-net.c | 68 +--
 lib/librte_vhost/virtio-net.h |  2 +-
 8 files changed, 132 insertions(+), 147 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index b63ea6f..4de3aa0 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -83,28 +83,26 @@ struct vhost_device_ctx {
int vid;/* Virtio-net device ID */
 };

-int vhost_new_device(struct vhost_device_ctx);
-void vhost_destroy_device(struct vhost_device_ctx);
+int vhost_new_device(void);
+void vhost_destroy_device(int);

-void vhost_set_ifname(struct vhost_device_ctx,
-   const char *if_name, unsigned int if_len);
+void vhost_set_ifname(int, const char *if_name, unsigned int if_len);

-int vhost_get_features(struct vhost_device_ctx, uint64_t *);
-int vhost_set_features(struct vhost_device_ctx, uint64_t *);
+int vhost_get_features(int, uint64_t *);
+int vhost_set_features(int, uint64_t *);

-int vhost_set_vring_num(struct vhost_device_ctx, struct vhost_vring_state *);
-int vhost_set_vring_addr(struct vhost_device_ctx, struct vhost_vring_addr *);
-int vhost_set_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
-int vhost_get_vring_base(struct vhost_device_ctx,
-   uint32_t, struct vhost_vring_state *);
+int vhost_set_vring_num(int, struct vhost_vring_state *);
+int vhost_set_vring_addr(int, struct vhost_vring_addr *);
+int vhost_set_vring_base(int, struct vhost_vring_state *);
+int vhost_get_vring_base(int, uint32_t, struct vhost_vring_state *);

-int vhost_set_vring_kick(struct vhost_device_ctx, struct vhost_vring_file *);
-int vhost_set_vring_call(struct vhost_device_ctx, struct vhost_vring_file *);
+int vhost_set_vring_kick(int, struct vhost_vring_file *);
+int vhost_set_vring_call(int, struct vhost_vring_file *);

-int vhost_set_backend(struct vhost_device_ctx, struct vhost_vring_file *);
+int vhost_set_backend(int, struct vhost_vring_file *);

-int vhost_set_owner(struct vhost_device_ctx);
-int vhost_reset_owner(struct vhost_device_ctx);
+int vhost_set_owner(int);
+int vhost_reset_owner(int);

 /*
  * Backend-specific cleanup. Defined by vhost-cuse and vhost-user.
diff --git a/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c 
b/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c
index 3a9b33d..45a9a91 100644
--- a/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c
+++ b/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c
@@ -82,19 +82,18 @@ fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info 
*fi)
 static void
 vhost_net_open(fuse_req_t req, struct fuse_file_info *fi)
 {
-   struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi);
-   int err = 0;
+   int vid = 0;

-   err = vhost_new_device(ctx);
-   if (err == -1) {
+   vid = vhost_new_device();
+   if (vid == -1) {
fuse_reply_err(req, EPERM);
return;
}

-   fi->fh = err;
+   fi->fh = vid;

RTE_LOG(INFO, VHOST_CONFIG,
-   "(%d) device configuration started\n", err);
+   "(%d) device configuration started\n", vid);
fuse_reply_open(req, fi);
 }

@@ -107,17 +106,17 @@ vhost_net_release(fuse_req_t req, struct fuse_file_info 
*fi)
int err = 0;
struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi);

-   vhost_destroy_device(ctx);
+   vhost_destroy_device(ctx.vid);
RTE_LOG(INFO, VHOST_CONFIG, "(%d) device released\n", ctx.vid);
fuse_reply_err(req, err);
 }

 /*
  * Boilerplate code for CUSE IOCTL
- * Implicit arguments: ctx, req, result.
+ * Implicit arguments: vid, req, result.
  */
 #define VHOST_IOCTL(func) do { \
-   result = (func)(ctx);   \
+   result = (func)(vid);   \
fuse_reply_ioctl(req, result, NULL, 0); \
 } while (0)

@@ -134,41 +133,41 @@ vhost_net_release(fuse_req_t req, struct fuse_file_info 
*fi)

 /*
  * Boilerplate code for CUSE Read IOCTL
- * Implicit arguments: ctx, req, result, in_bufsz, in_buf.
+ * Implicit arguments: vid, req, result, in_bufsz, in_buf.
  */
 #define VHOST_IOCTL_R(type, var, func) do {\
if (!in_bufsz) {\
VHOST_IOCTL_RETRY(sizeof(type), 0);\
} else {\
(var) = *(const type*)in_buf;   \
-   result = func(ctx, &(var)); \
+   result = func(vid, &(var)); \
fuse_reply_ioctl(req, result, NULL, 0);\
}   \
 } while (0)

 /*
  * 

[dpdk-dev] [PATCH v2 05/19] vhost: rename device fh to vid

2016-05-12 Thread Yuanhan Liu
I failed to figure out what does "fh" mean here for a long while.
The only guess I could have had is "file handle". So, you get the
point that it's not well named.

I then figured it out that "fh" is derived from the fuse lib, and
my above guess is right. However, device_fh represents a virtio
net device ID. Therefore, here I rename it to vid (Virtio-net device
ID, or Vhost device ID; choose one you prefer) to make it easier for
understanding.

This name (vid) then will be considered to the only interface to
applications. That's another reason to do the rename: it's our
interface, make it more understandable.

Signed-off-by: Yuanhan Liu 
---
 examples/vhost/main.c | 44 +--
 examples/vhost/main.h |  2 +-
 lib/librte_vhost/rte_virtio_net.h |  2 +-
 lib/librte_vhost/vhost-net.h  |  6 ++--
 lib/librte_vhost/vhost_cuse/vhost-net-cdev.c  | 36 +++---
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |  6 ++--
 lib/librte_vhost/vhost_rxtx.c | 22 +++---
 lib/librte_vhost/vhost_user/vhost-net-user.c  | 16 +-
 lib/librte_vhost/vhost_user/virtio-net-user.c |  2 +-
 lib/librte_vhost/virtio-net.c | 30 +-
 10 files changed, 83 insertions(+), 83 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index beb56eb..cb04585 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -716,7 +716,7 @@ link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
if (find_vhost_dev(_hdr->s_addr)) {
RTE_LOG(ERR, VHOST_DATA,
"(%d) device is using a registered MAC!\n",
-   vdev->device_fh);
+   vdev->vid);
return -1;
}

@@ -724,12 +724,12 @@ link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

/* vlan_tag currently uses the device_id. */
-   vdev->vlan_tag = vlan_tags[vdev->device_fh];
+   vdev->vlan_tag = vlan_tags[vdev->vid];

/* Print out VMDQ registration info. */
RTE_LOG(INFO, VHOST_DATA,
"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d 
registered\n",
-   vdev->device_fh,
+   vdev->vid,
vdev->mac_address.addr_bytes[0], 
vdev->mac_address.addr_bytes[1],
vdev->mac_address.addr_bytes[2], 
vdev->mac_address.addr_bytes[3],
vdev->mac_address.addr_bytes[4], 
vdev->mac_address.addr_bytes[5],
@@ -737,11 +737,11 @@ link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)

/* Register the MAC address. */
ret = rte_eth_dev_mac_addr_add(ports[0], >mac_address,
-   (uint32_t)vdev->device_fh + vmdq_pool_base);
+   (uint32_t)vdev->vid + vmdq_pool_base);
if (ret)
RTE_LOG(ERR, VHOST_DATA,
"(%d) failed to add device MAC address to VMDQ\n",
-   vdev->device_fh);
+   vdev->vid);

/* Enable stripping of the vlan tag as we handle routing. */
if (vlan_strip)
@@ -820,19 +820,19 @@ virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf 
*m)
if (!dst_vdev)
return -1;

-   if (vdev->device_fh == dst_vdev->device_fh) {
+   if (vdev->vid == dst_vdev->vid) {
RTE_LOG(DEBUG, VHOST_DATA,
"(%d) TX: src and dst MAC is same. Dropping packet.\n",
-   vdev->device_fh);
+   vdev->vid);
return 0;
}

RTE_LOG(DEBUG, VHOST_DATA,
-   "(%d) TX: MAC address is local\n", dst_vdev->device_fh);
+   "(%d) TX: MAC address is local\n", dst_vdev->vid);

if (unlikely(dst_vdev->remove)) {
RTE_LOG(DEBUG, VHOST_DATA,
-   "(%d) device is marked for removal\n", 
dst_vdev->device_fh);
+   "(%d) device is marked for removal\n", dst_vdev->vid);
return 0;
}

@@ -855,10 +855,10 @@ find_local_dest(struct vhost_dev *vdev, struct rte_mbuf 
*m,
if (!dst_vdev)
return 0;

-   if (vdev->device_fh == dst_vdev->device_fh) {
+   if (vdev->vid == dst_vdev->vid) {
RTE_LOG(DEBUG, VHOST_DATA,
"(%d) TX: src and dst MAC is same. Dropping packet.\n",
-   vdev->device_fh);
+   vdev->vid);
return -1;
}

@@ -868,11 +868,11 @@ find_local_dest(struct vhost_dev *vdev, struct rte_mbuf 
*m,
 * the packet length by plus it.
 */
*offset  = VLAN_HLEN;
-   *vlan_tag = vlan_tags[vdev->device_fh];
+   *vlan_tag = vlan_tags[vdev->vid];

RTE_LOG(DEBUG, VHOST_DATA,
"(%d) TX: pkt to local VM device id: (%d), 

[dpdk-dev] [PATCH v2 04/19] examples/vhost: make a copy of virtio device id

2016-05-12 Thread Yuanhan Liu
Make a copy of virtio device id (device_fh) from the virtio_net struct,
so that we could have less dependency on the virtio_net struct.

Signed-off-by: Yuanhan Liu 
---
 examples/vhost/main.c | 59 ---
 examples/vhost/main.h |  1 +
 2 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 6c7541a..beb56eb 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -708,7 +708,6 @@ static int
 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
 {
struct ether_hdr *pkt_hdr;
-   struct virtio_net *dev = vdev->dev;
int i, ret;

/* Learn MAC address of guest device from packet */
@@ -717,7 +716,7 @@ link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
if (find_vhost_dev(_hdr->s_addr)) {
RTE_LOG(ERR, VHOST_DATA,
"(%d) device is using a registered MAC!\n",
-   dev->device_fh);
+   vdev->device_fh);
return -1;
}

@@ -725,12 +724,12 @@ link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

/* vlan_tag currently uses the device_id. */
-   vdev->vlan_tag = vlan_tags[dev->device_fh];
+   vdev->vlan_tag = vlan_tags[vdev->device_fh];

/* Print out VMDQ registration info. */
RTE_LOG(INFO, VHOST_DATA,
"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d 
registered\n",
-   dev->device_fh,
+   vdev->device_fh,
vdev->mac_address.addr_bytes[0], 
vdev->mac_address.addr_bytes[1],
vdev->mac_address.addr_bytes[2], 
vdev->mac_address.addr_bytes[3],
vdev->mac_address.addr_bytes[4], 
vdev->mac_address.addr_bytes[5],
@@ -738,11 +737,11 @@ link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)

/* Register the MAC address. */
ret = rte_eth_dev_mac_addr_add(ports[0], >mac_address,
-   (uint32_t)dev->device_fh + vmdq_pool_base);
+   (uint32_t)vdev->device_fh + vmdq_pool_base);
if (ret)
RTE_LOG(ERR, VHOST_DATA,
"(%d) failed to add device MAC address to VMDQ\n",
-   dev->device_fh);
+   vdev->device_fh);

/* Enable stripping of the vlan tag as we handle routing. */
if (vlan_strip)
@@ -814,7 +813,6 @@ virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
 {
struct ether_hdr *pkt_hdr;
struct vhost_dev *dst_vdev;
-   int fh;

pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

@@ -822,19 +820,19 @@ virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf 
*m)
if (!dst_vdev)
return -1;

-   fh = dst_vdev->dev->device_fh;
-   if (fh == vdev->dev->device_fh) {
+   if (vdev->device_fh == dst_vdev->device_fh) {
RTE_LOG(DEBUG, VHOST_DATA,
"(%d) TX: src and dst MAC is same. Dropping packet.\n",
-   fh);
+   vdev->device_fh);
return 0;
}

-   RTE_LOG(DEBUG, VHOST_DATA, "(%d) TX: MAC address is local\n", fh);
+   RTE_LOG(DEBUG, VHOST_DATA,
+   "(%d) TX: MAC address is local\n", dst_vdev->device_fh);

if (unlikely(dst_vdev->remove)) {
RTE_LOG(DEBUG, VHOST_DATA,
-   "(%d) device is marked for removal\n", fh);
+   "(%d) device is marked for removal\n", 
dst_vdev->device_fh);
return 0;
}

@@ -847,7 +845,7 @@ virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
  * and get its vlan tag, and offset if it is.
  */
 static inline int __attribute__((always_inline))
-find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
+find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
uint32_t *offset, uint16_t *vlan_tag)
 {
struct vhost_dev *dst_vdev;
@@ -857,10 +855,10 @@ find_local_dest(struct virtio_net *dev, struct rte_mbuf 
*m,
if (!dst_vdev)
return 0;

-   if (dst_vdev->dev->device_fh == dev->device_fh) {
+   if (vdev->device_fh == dst_vdev->device_fh) {
RTE_LOG(DEBUG, VHOST_DATA,
"(%d) TX: src and dst MAC is same. Dropping packet.\n",
-   dst_vdev->dev->device_fh);
+   vdev->device_fh);
return -1;
}

@@ -870,11 +868,11 @@ find_local_dest(struct virtio_net *dev, struct rte_mbuf 
*m,
 * the packet length by plus it.
 */
*offset  = VLAN_HLEN;
-   *vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];
+   *vlan_tag = vlan_tags[vdev->device_fh];

RTE_LOG(DEBUG, VHOST_DATA,
-   "(%d) TX: pkt to local VM device id (%d) vlan tag: %u.\n",
- 

[dpdk-dev] [PATCH v2 03/19] vhost: declare device fh as int

2016-05-12 Thread Yuanhan Liu
Firstly, "int" would be big enough: we don't need 64 bit to represent
a virtio-net device id. Secondly, this could let us avoid the ugly
"%" PRIu64 ".." stuff.

And since ctx.fh is derived from device_fh, declare it as int, too.

Signed-off-by: Yuanhan Liu 
---
 examples/vhost/main.c | 45 ++-
 lib/librte_vhost/rte_virtio_net.h |  2 +-
 lib/librte_vhost/vhost-net.h  |  8 ++---
 lib/librte_vhost/vhost_cuse/vhost-net-cdev.c  | 34 ++--
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |  7 ++---
 lib/librte_vhost/vhost_rxtx.c | 34 +---
 lib/librte_vhost/vhost_user/vhost-net-user.c  |  2 +-
 lib/librte_vhost/vhost_user/virtio-net-user.c |  2 +-
 lib/librte_vhost/virtio-net.c | 21 ++---
 9 files changed, 75 insertions(+), 80 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index ffc7209..6c7541a 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -716,7 +716,7 @@ link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)

if (find_vhost_dev(_hdr->s_addr)) {
RTE_LOG(ERR, VHOST_DATA,
-   "Device (%" PRIu64 ") is using a registered MAC!\n",
+   "(%d) device is using a registered MAC!\n",
dev->device_fh);
return -1;
}
@@ -728,7 +728,8 @@ link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
vdev->vlan_tag = vlan_tags[dev->device_fh];

/* Print out VMDQ registration info. */
-   RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS 
%02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
+   RTE_LOG(INFO, VHOST_DATA,
+   "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d 
registered\n",
dev->device_fh,
vdev->mac_address.addr_bytes[0], 
vdev->mac_address.addr_bytes[1],
vdev->mac_address.addr_bytes[2], 
vdev->mac_address.addr_bytes[3],
@@ -739,8 +740,9 @@ link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
ret = rte_eth_dev_mac_addr_add(ports[0], >mac_address,
(uint32_t)dev->device_fh + vmdq_pool_base);
if (ret)
-   RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC 
address to VMDQ\n",
-   dev->device_fh);
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) failed to add device MAC address to VMDQ\n",
+   dev->device_fh);

/* Enable stripping of the vlan tag as we handle routing. */
if (vlan_strip)
@@ -812,7 +814,7 @@ virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
 {
struct ether_hdr *pkt_hdr;
struct vhost_dev *dst_vdev;
-   uint64_t fh;
+   int fh;

pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

@@ -823,17 +825,16 @@ virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf 
*m)
fh = dst_vdev->dev->device_fh;
if (fh == vdev->dev->device_fh) {
RTE_LOG(DEBUG, VHOST_DATA,
-   "(%" PRIu64 ") TX: src and dst MAC is same. "
-   "Dropping packet.\n", fh);
+   "(%d) TX: src and dst MAC is same. Dropping packet.\n",
+   fh);
return 0;
}

-   RTE_LOG(DEBUG, VHOST_DATA,
-   "(%" PRIu64 ") TX: MAC address is local\n", fh);
+   RTE_LOG(DEBUG, VHOST_DATA, "(%d) TX: MAC address is local\n", fh);

if (unlikely(dst_vdev->remove)) {
-   RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
-   "Device is marked for removal\n", fh);
+   RTE_LOG(DEBUG, VHOST_DATA,
+   "(%d) device is marked for removal\n", fh);
return 0;
}

@@ -858,8 +859,8 @@ find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,

if (dst_vdev->dev->device_fh == dev->device_fh) {
RTE_LOG(DEBUG, VHOST_DATA,
-   "(%" PRIu64 ") TX: src and dst MAC is same. "
-   " Dropping packet.\n", dst_vdev->dev->device_fh);
+   "(%d) TX: src and dst MAC is same. Dropping packet.\n",
+   dst_vdev->dev->device_fh);
return -1;
}

@@ -872,8 +873,7 @@ find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
*vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];

RTE_LOG(DEBUG, VHOST_DATA,
-   "(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
-   "vlan tag: %u.\n",
+   "(%d) TX: pkt to local VM device id (%d) vlan tag: %u.\n",
dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);

return 0;
@@ -964,8 +964,8 @@ virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, 
uint16_t vlan_tag)
}
}

-   RTE_LOG(DEBUG, 

[dpdk-dev] [PATCH v2 02/19] vhost: set/reset dev flags internally

2016-05-12 Thread Yuanhan Liu
It does not make sense to ask the application to set/unset the flag
VIRTIO_DEV_RUNNING (that used internal only) at new_device()/
destroy_device() callback.

Instead, it should be set after new_device() succeeds and reset before
destroy_device() is invoked inside vhost lib. This patch fixes it.

Signed-off-by: Yuanhan Liu 
---
 drivers/net/vhost/rte_eth_vhost.c |  2 --
 examples/vhost/main.c |  3 ---
 lib/librte_vhost/vhost_user/virtio-net-user.c | 11 +++
 lib/librte_vhost/virtio-net.c | 21 ++---
 4 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 310cbef..63538c1 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -278,7 +278,6 @@ new_device(struct virtio_net *dev)
for (i = 0; i < dev->virt_qp_nb * VIRTIO_QNUM; i++)
rte_vhost_enable_guest_notification(dev, i, 0);

-   dev->flags |= VIRTIO_DEV_RUNNING;
dev->priv = eth_dev;
eth_dev->data->dev_link.link_status = ETH_LINK_UP;

@@ -341,7 +340,6 @@ destroy_device(volatile struct virtio_net *dev)
eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;

dev->priv = NULL;
-   dev->flags &= ~VIRTIO_DEV_RUNNING;

for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
vq = eth_dev->data->rx_queues[i];
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 32a79be..ffc7209 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1179,8 +1179,6 @@ destroy_device (volatile struct virtio_net *dev)
struct vhost_dev *vdev;
int lcore;

-   dev->flags &= ~VIRTIO_DEV_RUNNING;
-
vdev = (struct vhost_dev *)dev->priv;
/*set the remove flag. */
vdev->remove = 1;
@@ -1257,7 +1255,6 @@ new_device (struct virtio_net *dev)
/* Disable notifications. */
rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
-   dev->flags |= VIRTIO_DEV_RUNNING;

RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data 
core %d\n", dev->device_fh, vdev->coreid);

diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c 
b/lib/librte_vhost/vhost_user/virtio-net-user.c
index f5248bc..e775e45 100644
--- a/lib/librte_vhost/vhost_user/virtio-net-user.c
+++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
@@ -115,8 +115,10 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct 
VhostUserMsg *pmsg)
return -1;

/* Remove from the data plane. */
-   if (dev->flags & VIRTIO_DEV_RUNNING)
+   if (dev->flags & VIRTIO_DEV_RUNNING) {
+   dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev);
+   }

if (dev->mem) {
free_mem_region(dev);
@@ -286,9 +288,10 @@ user_set_vring_kick(struct vhost_device_ctx ctx, struct 
VhostUserMsg *pmsg)
"vring kick idx:%d file:%d\n", file.index, file.fd);
vhost_set_vring_kick(ctx, );

-   if (virtio_is_ready(dev) &&
-   !(dev->flags & VIRTIO_DEV_RUNNING))
-   notify_ops->new_device(dev);
+   if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
+   if (notify_ops->new_device(dev) == 0)
+   dev->flags |= VIRTIO_DEV_RUNNING;
+   }
 }

 /*
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1a6259b..c45ed1c 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -296,8 +296,10 @@ vhost_destroy_device(struct vhost_device_ctx ctx)
if (dev == NULL)
return;

-   if (dev->flags & VIRTIO_DEV_RUNNING)
+   if (dev->flags & VIRTIO_DEV_RUNNING) {
+   dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev);
+   }

cleanup_device(dev, 1);
free_device(dev);
@@ -353,8 +355,10 @@ vhost_reset_owner(struct vhost_device_ctx ctx)
if (dev == NULL)
return -1;

-   if (dev->flags & VIRTIO_DEV_RUNNING)
+   if (dev->flags & VIRTIO_DEV_RUNNING) {
+   dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev);
+   }

cleanup_device(dev, 0);
reset_device(dev);
@@ -719,12 +723,15 @@ vhost_set_backend(struct vhost_device_ctx ctx, struct 
vhost_vring_file *file)
if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
if (dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED &&
dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED) {
-   return notify_ops->new_device(dev);
+   if (notify_ops->new_device(dev) < 0)
+   return -1;
+   dev->flags |= VIRTIO_DEV_RUNNING;
}
-   /* Otherwise we 

[dpdk-dev] [PATCH v2 01/19] vhost: declare backend with int type

2016-05-12 Thread Yuanhan Liu
It's an fd; so define it as "int", which could also save the unncessary
(int) case.

Signed-off-by: Yuanhan Liu 
---
 lib/librte_vhost/rte_virtio_net.h | 2 +-
 lib/librte_vhost/virtio-net.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/librte_vhost/rte_virtio_net.h 
b/lib/librte_vhost/rte_virtio_net.h
index 600b20b..4d25f79 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -85,7 +85,7 @@ struct vhost_virtqueue {
struct vring_avail  *avail; /**< Virtqueue 
available ring. */
struct vring_used   *used;  /**< Virtqueue used 
ring. */
uint32_tsize;   /**< Size of descriptor 
ring. */
-   uint32_tbackend;/**< Backend value to 
determine if device should started/stopped. */
+   int backend;/**< Backend value to 
determine if device should started/stopped. */
uint16_tvhost_hlen; /**< Vhost header 
length (varies depending on RX merge buffers. */
volatile uint16_t   last_used_idx;  /**< Last index used on 
the available ring */
volatile uint16_t   last_used_idx_res;  /**< Used for multiple 
devices reserving buffers. */
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index f4695af..1a6259b 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -717,8 +717,8 @@ vhost_set_backend(struct vhost_device_ctx ctx, struct 
vhost_vring_file *file)
 * we add the device.
 */
if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
-   if (((int)dev->virtqueue[VIRTIO_TXQ]->backend != 
VIRTIO_DEV_STOPPED) &&
-   ((int)dev->virtqueue[VIRTIO_RXQ]->backend != 
VIRTIO_DEV_STOPPED)) {
+   if (dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED &&
+   dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED) {
return notify_ops->new_device(dev);
}
/* Otherwise we remove it. */
-- 
1.9.0



[dpdk-dev] [PATCH v2 00/19] vhost ABI/API refactoring

2016-05-12 Thread Yuanhan Liu
v2: - exported ifname as well to fix a vhost-pmd issue reported
  by Rich
- separated the big patch that introduces several new APIs
  into some small patches.
- updated release note
- updated version.map

NOTE: I created a branch at dpdk.org [0] for more conveinient
testing:

[0]: git://dpdk.org/next/dpdk-next-virtio for-testing


Every time we introduce a new feature to vhost, we are likely
to break ABI. Moreover, some cleanups (such as the one from Ilya
to remove vec_buf from vhost_virtqueue struct) also break ABI.

This patch set is meant to resolve above issue ultimately, by
hiding virtio_net structure (as well as few others) internaly,
and export the virtio_net dev strut to applications by a number,
vid, like the way kernel exposes an fd to user space.

Back to the patch set, the first part of this set makes some
changes to vhost example, vhost-pmd and vhost, bit by bit, to
remove the dependence to "virtio_net" struct. And then do the
final change to make the current APIs to adapt to using "vid".

After that, "vrtio_net_device_ops" is the only left open struct
that an application can acces, therefore, it's the only place
that might introduce potential ABI breakage in future for
extension. Hence, I made few more (5) space reservation, to
make sure we will not break ABI for a long time, and hopefuly,
forever.

The last bit of this patch set is some cleanups, including the
one from Ilya.

Note that this refactoring breaks the tep_termination example.
Well, it's just another copy of the original messy vhost example,
and I have no interest to cleanup it again. Therefore, I might
consider to remove that example later, and add the vxlan bits
into vhost example.

Thanks.

--yliu

---
Ilya Maximets (1):
  vhost: make buf vector for scatter Rx local

Yuanhan Liu (18):
  vhost: declare backend with int type
  vhost: set/reset dev flags internally
  vhost: declare device fh as int
  examples/vhost: make a copy of virtio device id
  vhost: rename device fh to vid
  vhost: get device by vid only
  vhost: move vhost device ctx to cuse
  vhost: introduce new API to export numa node
  vhost: introduce new API to export number of queues
  vhost: introduce new API to export ifname
  vhost: introduce new API to export queue free entries
  vhost: remove dependency on priv field
  vhost: export vid as the only interface to applications
  vhost: hide internal structs/macros/functions
  vhost: remove unnecessary fields
  vhost: remove virtio-net.h
  vhost: reserve few more space for future extension
  vhost: per device virtio net header len

 doc/guides/rel_notes/release_16_07.rst|   9 +
 drivers/net/vhost/rte_eth_vhost.c |  79 -
 examples/vhost/main.c | 124 +++---
 examples/vhost/main.h |   1 +
 lib/librte_vhost/rte_vhost_version.map|  10 ++
 lib/librte_vhost/rte_virtio_net.h | 223 +++--
 lib/librte_vhost/vhost-net.h  | 201 ++
 lib/librte_vhost/vhost_cuse/vhost-net-cdev.c  |  83 +-
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |  30 ++--
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.h |  12 +-
 lib/librte_vhost/vhost_rxtx.c | 133 ---
 lib/librte_vhost/vhost_user/vhost-net-user.c  |  53 +++---
 lib/librte_vhost/vhost_user/vhost-net-user.h  |   2 +
 lib/librte_vhost/vhost_user/virtio-net-user.c |  64 +++
 lib/librte_vhost/vhost_user/virtio-net-user.h |  18 +-
 lib/librte_vhost/virtio-net.c | 229 +-
 lib/librte_vhost/virtio-net.h |  43 -
 17 files changed, 702 insertions(+), 612 deletions(-)
 delete mode 100644 lib/librte_vhost/virtio-net.h

-- 
1.9.0



[dpdk-dev] [PATCH v4 6/8] virtio-user: add new virtual pci driver for virtio

2016-05-12 Thread Yuanhan Liu
On Fri, May 13, 2016 at 09:54:33AM +0800, Tan, Jianfeng wrote:
> 
> So, I'd suggest something like following:
> 
> if (is_vdev(..)) {
> 
> 
> The blocker issue of your suggestion is that we have no such condition.
> 
> Previously, I use dev_type, but as David's comment said:

That's not the only option. There should be others, for example,
checking the existence of virtio_user_device. Or even, you could
add a new flag inside virtio hw while initiating your vdev.

> May I ask how many more such handling are needed, excluding the tx queue
> header desc setup? And as stated, in generic, yes, we should try that.
> 
> 
> Those which need special handling:
> (1) vq->vq_ring_mem: it is set but never used, so it's out of question.
> (2) vq->virtio_net_hdr_mem and vring_hdr_desc_init

vring_hdr_desc_init is common.

> (3) vq->offset
> 
> Just (2) and (3) so far. And the question is quite clear: where to put these
> two special handling.

Apparently, you can't put it into the queue_setup(). And I still think
my proposal works great here.

--yliu


[dpdk-dev] [RFC] mbuf: new flag when vlan is stripped

2016-05-12 Thread John Daley (johndale)
Hi Olivier,

> ...
> This is a draft patch that implements what was previously discussed, except
> the packet_type, which does not exist for vlan today (and I think it is not
> mandatory for now, let's do it in another patch).
> 
> After doing this patch, it appeared that ixgbe was the only driver that had a
> different behavior for the PKT_RX_VLAN_PKT flag. An alternative to this
> patch would be to only change the behavior of the ixgbe driver, and just
> document better document the PKT_RX_VLAN_PKT flags in rte_mbuf.h
> without adding new flags. I think this is a better option.
> 
> Comments are welcome.
>
There are applications depending on the current behavior of PKT_RX_VLAN_PKT as 
confusing as it may be.  I know of one that has VLAN stripping disabled and 
uses the flag to determine if the packet delivered to the app has a VLAN tag. 
This is actually how the flag behaves for ixgbe, and they patched enic and 
maybe other drivers to act accordingly. To avoid breaking the app (and any 
others like it), I think we should keep the flag behavior the same and add the 
new flag PKT_RX_VLAN_STRIPPED .

We should follow on with the new packet type since it enables a nice 
performance improvement by not forcing apps to break open the packet just to 
see if there is a VLAN tag.

Thanks,
john



[dpdk-dev] mbuff rearm_data aligmenet issue on non x86

2016-05-12 Thread Jerin Jacob
On Thu, May 12, 2016 at 01:14:34PM +, Ananyev, Konstantin wrote:
> > 
> > On Thu, May 12, 2016 at 10:07:09AM +, Ananyev, Konstantin wrote:
> > > Hi Jerrin,
> > >
> > > >
> > > > Hi All,
> > > >
> > > > I would like align mbuff rearm_data field to 8 byte aligned so that
> > > > write to mbuf->rearm_data with uint64_t* will be naturally aligned.
> > > > I am not sure about IA but some other architecture/implementation has 
> > > > overhead
> > > > in non-naturally aligned stores.
> > > >
> > > > Proposed patch is something like this below, But open for any change to
> > > > make fit for all other architectures/platform.
> > > >
> > > > Any thoughts ?
> > > >
> > > > ? [master] [dpdk-master] $ git diff
> > > > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> > > > index 529debb..5a917d0 100644
> > > > --- a/lib/librte_mbuf/rte_mbuf.h
> > > > +++ b/lib/librte_mbuf/rte_mbuf.h
> > > > @@ -733,10 +733,8 @@ struct rte_mbuf {
> > > > void *buf_addr;   /**< Virtual address of segment
> > > > buffer. */
> > > > phys_addr_t buf_physaddr; /**< Physical address of segment
> > > > buffer. */
> > > >
> > > > -   uint16_t buf_len; /**< Length of segment buffer. */
> > > > -
> > >
> > >
> > > There is no need to move buf_len itself, I think.
> > > Just move rearm_data marker prior to buf_len is enough.
> > > Though how do you suggest to deal with the fact, that right now we blindly
> > > update the whole 64bits pointed by rearm_data:
> > >
> > > drivers/net/ixgbe/ixgbe_rxtx_vec.c:
> > >   /*
> > >  * Flush mbuf with pkt template.
> > >  * Data to be rearmed is 6 bytes long.
> > >  * Though, RX will overwrite ol_flags that are coming next
> > >  * anyway. So overwrite whole 8 bytes with one load:
> > >  * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
> > >  */
> > > p0 = (uintptr_t)>rearm_data;
> > > *(uint64_t *)p0 = rxq->mbuf_initializer;
> > >
> > > ?
> > >
> > > If buf_len will be inside these 64bits, we can't do it anymore.
> > >
> > > Are you suggesting something like:
> > >
> > > uint64_t *p0, v0;
> > >
> > > p0 = >rearm_data;
> > > v0 = *p0 & REARM_MASK;
> > > *p0 = v0 | rxq->mbuf_initializer;
> > > ?
> > 
> > Due to unaligned rearm_data issue, In ThunderX platform, we need to write
> > multiple half word of aligned stores(so masking was better us).
> 
> Ok, so what would be the gain on ARM if you'll make that change?

~4 cpu cycles per packet.Again it may not be ARM architecture specific
as ARM architecture does not define  instruction latency so it is more of a
implementation specific data.

> Again, what would be the drop (if any) on IA?
> 
> > But I think, if we can put 16bit hole between port and ol_flags then
> > we may not need the masking stuff in ixgbe. Right?
> 
> You mean move buf_len somewhere else (end of cacheline0) and 
> introduce a 2B hole between port and ol_flags, right?

Yes

> Yep, that probably wouldn't have any performance impact.

I will try two options and send a patch which don't have any
performance impact on IA.

> 
> > 
> > OR
> > 
> > Even better, if we can fill in a uint16_t variable which will replaced
> > later in the flow like "data_len"?
> 
> data_len is grouped  with rx_descriptor_fields1 on purpose -
> so we can update packet_type,pkt_len, data_len,vlan_tci,rss with one 16B 
> write.

OK

> 
> Konstantin
> 
> > and move buf_len at end the first  cache line? 
> >or any other thoughts to fix unaligned rearm_data issue?
> > 
> > Jerin
> > 
> > 
> > 
> > >
> > > If so I wonder what would be the performance impact of that change.
> > > Konstantin
> > >
> > >
> > > > /* next 6 bytes are initialised on RX descriptor rearm */
> > > > -   MARKER8 rearm_data;
> > > > +   MARKER64 rearm_data;
> > > > uint16_t data_off;
> > > >
> > > > /**
> > > > @@ -754,6 +752,7 @@ struct rte_mbuf {
> > > > uint8_t nb_segs;  /**< Number of segments. */
> > > > uint8_t port; /**< Input port. */
> > > >
> > > > +   uint16_t buf_len; /**< Length of segment buffer. */
> > > > uint64_t ol_flags;/**< Offload features. */
> > > >
> > > > /* remaining bytes are set on RX when pulling packet from
> > > >  * descriptor
> > > >
> > > > /Jerin


[dpdk-dev] [PATCH v4 6/8] virtio-user: add new virtual pci driver for virtio

2016-05-12 Thread Michael S. Tsirkin
On Thu, May 12, 2016 at 03:08:05PM +0800, Tan, Jianfeng wrote:
> (2) It's more aligned to previous logic to hide the detail to differentiate
> modern/legacy device.

Why is there a need to support legacy interfaces at all?  It's a container
so if it's in use one can be reasonably sure you have a new kernel.

-- 
MST


[dpdk-dev] [RFC PATCH v2 1/3] rte: change xstats to use integer keys

2016-05-12 Thread Thomas Monjalon
2016-05-09 13:59, David Harton:
> >  }
> > 
> > -
> 
> Not sure how the community feels about white-space only changes.
> Just mentioning in case some folks get excited about it.  One here and a few 
> below.

It's a trivial cleanup. No problem I think.


[dpdk-dev] [PATCH v2 09/11] eal/pci: replace SYSFS_PCI_DEVICES with pci_get_sysfs_path()

2016-05-12 Thread Thomas Monjalon
2016-05-12 17:46, Jan Viktorin:
> On Thu, 12 May 2016 17:41:22 +0200
> Thomas Monjalon  wrote:
> > 2016-05-10 20:13, Jan Viktorin:
> > > + orig = pci_get_sysfs_path();
> > > + ret = setenv("SYSFS_PCI_DEVICES", "My Documents", 1);  
> > 
> > Oh no!
> 
> Not sure about your reaction...

MS reference... ;)

> > 
> > > + TEST_ASSERT_SUCCESS(ret, "Failed setenv to My Documents");
> > > +
> > > + path = pci_get_sysfs_path();
> > > + TEST_ASSERT(strcmp(orig, path),
> > > + "orig must be different from path: "  
> > 
> > I missed something here. Why different?
> 
> Because I've set it to "My Documents" and want to be sure that the
> pci_get_sysfs_path() returns the new path instead of the default
> one.
> 
> Perhaps, !strcmp(path, "My Documents") would be better here...

No, just rethink the variable names maybe.


[dpdk-dev] [PATCH v2 06/11] app/test: use linked list to store PCI drivers

2016-05-12 Thread Thomas Monjalon
2016-05-12 17:53, Jan Viktorin:
> On Thu, 12 May 2016 17:31:28 +0200
> Thomas Monjalon  wrote:
> 
> > 2016-05-10 20:13, Jan Viktorin:
> > > The test unregisters all real drivers before starting into an array. This
> > > inflexiable as we can use a linked list for this purpose.  
> > 
> > I don't understand this. Maybe some words are missing.
> 
> Better?
> 
> The test unregisters all real drivers before starting (stored into an array).
> This is inflexiable (due to its fixed size) and we can use a linked list for
> this purpose.  

Better with a past tense? "The test was unregistering..."

> > > +/* real drivers (not used for testing) */  
> > 
> > What do mean by "not used for testing"?
> 
> The test now avoids the DPDK builtin drivers. It only considers its
> internal fake drivers my_driver and my_driver2. So the real drivers
> are temporarily store into the real_pci_driver_list and returned back
> after the test finishes.

Maybe adding "Save" or "Backup" would make it clear.

> It is the linked list mentioned in the commit log. It replaces the
> original fixed-size array.
> 
> (For drivers, it does not matter that much. But for devices, I think,
> it is not a good practice to consider them in autotests. Every PC
> where you execute the tests have different set of PCI devices.)

Yes


[dpdk-dev] [PATCH v2 06/11] app/test: use linked list to store PCI drivers

2016-05-12 Thread Jan Viktorin
On Thu, 12 May 2016 17:31:28 +0200
Thomas Monjalon  wrote:

> 2016-05-10 20:13, Jan Viktorin:
> > The test unregisters all real drivers before starting into an array. This
> > inflexiable as we can use a linked list for this purpose.  
> 
> I don't understand this. Maybe some words are missing.

Better?

The test unregisters all real drivers before starting (stored into an array).
This is inflexiable (due to its fixed size) and we can use a linked list for
this purpose.  

> 
> > +/* real drivers (not used for testing) */  
> 
> What do mean by "not used for testing"?

The test now avoids the DPDK builtin drivers. It only considers its
internal fake drivers my_driver and my_driver2. So the real drivers
are temporarily store into the real_pci_driver_list and returned back
after the test finishes.

It is the linked list mentioned in the commit log. It replaces the
original fixed-size array.

(For drivers, it does not matter that much. But for devices, I think,
it is not a good practice to consider them in autotests. Every PC
where you execute the tests have different set of PCI devices.)

> 
> > +struct pci_driver_list real_pci_driver_list =
> > +   TAILQ_HEAD_INITIALIZER(real_pci_driver_list);  
> 



-- 
   Jan Viktorin  E-mail: Viktorin at RehiveTech.com
   System Architect  Web:www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic


[dpdk-dev] [PATCHv3 1/2] config/armv8a: disable igb_uio

2016-05-12 Thread Jianbo Liu
On 12 May 2016 at 16:57, Santosh Shukla
 wrote:
> On Thu, May 12, 2016 at 01:54:13PM +0800, Jianbo Liu wrote:
>> On 12 May 2016 at 13:06, Santosh Shukla
>>  wrote:
>> > On Thu, May 12, 2016 at 11:42:26AM +0800, Jianbo Liu wrote:
>> >> On 12 May 2016 at 11:17, Santosh Shukla
>> >>  wrote:
>> >> > On Thu, May 12, 2016 at 10:01:05AM +0800, Jianbo Liu wrote:
>> >> >> On 12 May 2016 at 02:25, Stephen Hemminger > >> >> networkplumber.org> wrote:
>> >> >> > On Wed, 11 May 2016 22:32:16 +0530
>> >> >> > Jerin Jacob  wrote:
>> >> >> >
>> >> >> >> On Wed, May 11, 2016 at 08:22:59AM -0700, Stephen Hemminger wrote:
>> >> >> >> > On Wed, 11 May 2016 19:17:58 +0530
>> >> >> >> > Hemant Agrawal  wrote:
>> >> >> >> >
>> >> >> >> > > IGB_UIO not supported for arm64 arch in kernel so disable.
>> >> >> >> > >
>> >> >> >> > > Signed-off-by: Hemant Agrawal 
>> >> >> >> > > Reviewed-by: Santosh Shukla > >> >> >> > > caviumnetworks.com>
>> >> >> >> >
>> >> >> >> > Really, I have use IGB_UIO on ARM64
>> >> >> >>
>> >> >> >> May I know what is the technical use case for igb_uio on arm64
>> >> >> >> which cannot be addressed through vfio or vfioionommu.
>> >> >> >
>> >> >> > I was running on older kernel which did not support vfioionommu mode.
>> >> >>
>> >> >> As I said, most of DPDK developers are not kernel developers. They may
>> >> >> have their own kernel tree, and couldn't like to upgrade to latest
>> >> >> kernel.
>> >> >> They can choose to use or not use igb_uio when binding the driver. But
>> >> >> blindly disabling it in the base config seems unreasonable.
>> >> >
>> >> > if user keeping his own kernel so they could also keep IGB_UIO=y in 
>> >> > their local
>> >> Most likely they don't have local dpdk tree. They write their own
>> >> applications, complie and link to dpdk lib, then done.
>> >>
>> >> > dpdk tree. Why are you imposing user-x custome depedancy on upstream 
>> >> > dpdk base
>> >> Customer requiremnts is important. I want they can choose the way they 
>> >> like.
>> >>
>> >
>> > so you choose to keep igb_uio option, provided arch doesn't support?
>> > new user did reported issues with igb_uio for arm64, refer this thread 
>> > [1], as
>> > well hemanth too faced issues. we want to avoid that.
>> >
>> > If customer maintaing out-of-tree kernel then he can also switch to 
>> > vfio-way.
>> > isn;t it?
>> >
>> >> > config. Is it not enough for explanation that - Base config ie.. armv8 
>> >> > doesn;t
>> >> > support pci mmap, so igb_uio is n/a. New user wont able to build/run 
>> >> > dpdk/arm64
>> >> > in igb_uio-way, He'll prefer to use upstream stuff. I think, you are 
>> >> > not making
>> >> You are wrong, he can build dpdk. If he like to use upstream without
>> >> patching, he can use vfio.
>> >
>> > I disagree, we want to avoid [1] for new user.
>> >
>> >> But you can't ignore the need from old user which is more comfortable
>> >> with older kernel.
>> >>
>> > arm/arm64 dpdk support recently added and I am guessing, most likely 
>> > customer
>> > using near latest kernel, switching to vfio won't be so difficult.
>> >
>> > Or can you take up responsibility of upstreaming pci mmap patch, then we 
>> > don't
>> > need this patch.
>> >
>> > [1] http://dpdk.org/ml/archives/dev/2016-January/031313.html
>>
>> Can you read carefully about the guide at
>> http://dpdk.org/doc/guides/linux_gsg/build_dpdk.html? It says to use
>> uio_pci_generic, igb_uio or vfio-pci.
>
> *** applicable and works for x86 only, not for arm64: because pci mmap support
> not present for arm64, in that case we should update the doc.
>
>> Could it be possible that the user in that thread has already read and
>> tried them all and found that he can't enable vifo with his kernel,
>> and igb_uio is the easy way for him and asked for help from community?
>> If so, we have no choice but keeping igb_uio enabled.
>
> By then vfionoiommu support was wip progress in dpdk/linux. but now it merged
> and it works. So no need to retain igb_uio in base config for which to work -
> user need to use mmap patch at linux side.

We can't decide which kernel user will use.

>
> Or can you maintain out-of-tree pci mmap patch/ kerne source and make it
> explicit somewhere in dpdk build doc that - if user want igb_uio way then
> use kernel/mmap patch from x location.

The patch is in the kernel maillist, and user google it.
And isn't funny to ask someone to do something again and again (3
times) in this thread?

>
>> He use lsmod to show us the modules, most likely he know vifo-pci.
>>
>> Below are the details on modules, hugepages and device binding.
>> root at arm64:~# lsmod
>> Module  Size  Used by
>> rte_kni   292795  0
>> igb_uio 4338  0
>> ixgbe 184456  0


[dpdk-dev] mbuff rearm_data aligmenet issue on non x86

2016-05-12 Thread Jerin Jacob
On Thu, May 12, 2016 at 10:07:09AM +, Ananyev, Konstantin wrote:
> Hi Jerrin,
> 
> > 
> > Hi All,
> > 
> > I would like align mbuff rearm_data field to 8 byte aligned so that
> > write to mbuf->rearm_data with uint64_t* will be naturally aligned.
> > I am not sure about IA but some other architecture/implementation has 
> > overhead
> > in non-naturally aligned stores.
> > 
> > Proposed patch is something like this below, But open for any change to
> > make fit for all other architectures/platform.
> > 
> > Any thoughts ?
> > 
> > ? [master] [dpdk-master] $ git diff
> > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> > index 529debb..5a917d0 100644
> > --- a/lib/librte_mbuf/rte_mbuf.h
> > +++ b/lib/librte_mbuf/rte_mbuf.h
> > @@ -733,10 +733,8 @@ struct rte_mbuf {
> > void *buf_addr;   /**< Virtual address of segment
> > buffer. */
> > phys_addr_t buf_physaddr; /**< Physical address of segment
> > buffer. */
> > 
> > -   uint16_t buf_len; /**< Length of segment buffer. */
> > -
> 
> 
> There is no need to move buf_len itself, I think.
> Just move rearm_data marker prior to buf_len is enough.
> Though how do you suggest to deal with the fact, that right now we blindly
> update the whole 64bits pointed by rearm_data:
> 
> drivers/net/ixgbe/ixgbe_rxtx_vec.c:
>   /*
>  * Flush mbuf with pkt template.
>  * Data to be rearmed is 6 bytes long.
>  * Though, RX will overwrite ol_flags that are coming next
>  * anyway. So overwrite whole 8 bytes with one load:
>  * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
>  */
> p0 = (uintptr_t)>rearm_data;
> *(uint64_t *)p0 = rxq->mbuf_initializer;
> 
> ?
> 
> If buf_len will be inside these 64bits, we can't do it anymore.
> 
> Are you suggesting something like:
> 
> uint64_t *p0, v0; 
> 
> p0 = >rearm_data;
> v0 = *p0 & REARM_MASK;
> *p0 = v0 | rxq->mbuf_initializer;
> ? 

Due to unaligned rearm_data issue, In ThunderX platform, we need to write
multiple half word of aligned stores(so masking was better us).

But I think, if we can put 16bit hole between port and ol_flags then
we may not need the masking stuff in ixgbe. Right?

OR

Even better, if we can fill in a uint16_t variable which will replaced
later in the flow like "data_len"? and move buf_len at end the first
cache line? or any other thoughts to fix unaligned rearm_data issue?

Jerin



> 
> If so I wonder what would be the performance impact of that change.
> Konstantin
> 
> 
> > /* next 6 bytes are initialised on RX descriptor rearm */
> > -   MARKER8 rearm_data;
> > +   MARKER64 rearm_data;
> > uint16_t data_off;
> > 
> > /**
> > @@ -754,6 +752,7 @@ struct rte_mbuf {
> > uint8_t nb_segs;  /**< Number of segments. */
> > uint8_t port; /**< Input port. */
> > 
> > +   uint16_t buf_len; /**< Length of segment buffer. */
> > uint64_t ol_flags;/**< Offload features. */
> > 
> > /* remaining bytes are set on RX when pulling packet from
> >  * descriptor
> > 
> > /Jerin


[dpdk-dev] [PATCH v2 09/11] eal/pci: replace SYSFS_PCI_DEVICES with pci_get_sysfs_path()

2016-05-12 Thread Jan Viktorin
On Thu, 12 May 2016 17:41:22 +0200
Thomas Monjalon  wrote:

> 2016-05-10 20:13, Jan Viktorin:
> > The SYSFS_PCI_DEVICES is a constant that makes the PCI testing difficult as
> > it points to an absolute path. We remove using this constant and introducing
> > a function pci_get_sysfs_path that gives the same value. However, the user 
> > can
> > pass a SYSFS_PCI_DEVICES env variable to override the path. It is now 
> > possible
> > to create a fake sysfs hierarchy for testing.  
> 
> Yeah!

:)

> 
> > +   orig = pci_get_sysfs_path();
> > +   ret = setenv("SYSFS_PCI_DEVICES", "My Documents", 1);  
> 
> Oh no!

Not sure about your reaction...

> 
> > +   TEST_ASSERT_SUCCESS(ret, "Failed setenv to My Documents");
> > +
> > +   path = pci_get_sysfs_path();
> > +   TEST_ASSERT(strcmp(orig, path),
> > +   "orig must be different from path: "  
> 
> I missed something here. Why different?

Because I've set it to "My Documents" and want to be sure that the
pci_get_sysfs_path() returns the new path instead of the default
one.

Perhaps, !strcmp(path, "My Documents") would be better here...

> 
> > +DPDK_16.07 {
> > +   global:
> > +
> > +   pci_get_sysfs_path;
> > +} DPDK_16.04;  
> 
> I don't know why but we are used to put a blank line after the last symbol.

Will fix.

> 



-- 
   Jan Viktorin  E-mail: Viktorin at RehiveTech.com
   System Architect  Web:www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic


[dpdk-dev] [PATCH v2 09/11] eal/pci: replace SYSFS_PCI_DEVICES with pci_get_sysfs_path()

2016-05-12 Thread Thomas Monjalon
2016-05-10 20:13, Jan Viktorin:
> The SYSFS_PCI_DEVICES is a constant that makes the PCI testing difficult as
> it points to an absolute path. We remove using this constant and introducing
> a function pci_get_sysfs_path that gives the same value. However, the user can
> pass a SYSFS_PCI_DEVICES env variable to override the path. It is now possible
> to create a fake sysfs hierarchy for testing.

The headline do not convey the intent. It could be:
pci: allow to override sysfs


[dpdk-dev] [PATCH v2 09/11] eal/pci: replace SYSFS_PCI_DEVICES with pci_get_sysfs_path()

2016-05-12 Thread Thomas Monjalon
2016-05-10 20:13, Jan Viktorin:
> The SYSFS_PCI_DEVICES is a constant that makes the PCI testing difficult as
> it points to an absolute path. We remove using this constant and introducing
> a function pci_get_sysfs_path that gives the same value. However, the user can
> pass a SYSFS_PCI_DEVICES env variable to override the path. It is now possible
> to create a fake sysfs hierarchy for testing.

Yeah!

> + orig = pci_get_sysfs_path();
> + ret = setenv("SYSFS_PCI_DEVICES", "My Documents", 1);

Oh no!

> + TEST_ASSERT_SUCCESS(ret, "Failed setenv to My Documents");
> +
> + path = pci_get_sysfs_path();
> + TEST_ASSERT(strcmp(orig, path),
> + "orig must be different from path: "

I missed something here. Why different?

> +DPDK_16.07 {
> + global:
> +
> + pci_get_sysfs_path;
> +} DPDK_16.04;

I don't know why but we are used to put a blank line after the last symbol.



[dpdk-dev] [dpdk-dev, 2/3] eth_dev: add support for device dma mask

2016-05-12 Thread Jan Viktorin
Hi,

Just a note, please, when replying inline, do not prepend ">" before your
new text. I could not find your replies.

See below...

On Thu, 12 May 2016 16:03:14 +0100
Alejandro Lucero  wrote:

> Hi Jan
> 
> On Thu, May 12, 2016 at 3:52 PM, Jan Viktorin 
> wrote:
> 
> > Hello Alejandro,
> >
> > On Thu, 12 May 2016 15:33:59 +0100
> > "Alejandro.Lucero"  wrote:
> >  
> > > - New dma_mask field in rte_eth_dev_data.
> > >  - If PMD sets device dma_mask, call to check hugepages within  
> >
> > I think that one of the purposes of DPDK is to support DMA transfers
> > in userspace. In other words, I can see no reason to support dma_mask
> > at the ethdev level only.
> >
> > The limitation is a device limitation so I can not see a better place for  
> adding the device dma mask.

That's what I've meant. It is a _device_ limitation. The ethdev is a wrapper
around the rte_pci_device. The ethdev just extends semantics of the generic 
device.
However, all DPDK devices are expected to be DMA-capable.

If you get a pointer to the ethdev, you get a pointer to the rte_pci_device as 
well
(1 more level of dereference but we are not on the fast path here, so it's 
unimportant).

Consider the cryptodev. If cryptodev has some DMA mask requirements we can 
support it
in the generic place, i.e. rte_pci_device and not rte_ethdev because the 
cryptodev
is not an ethdev.

> 
> 
> > We should consider adding this to the generic struct rte_device
> > (currently rte_pci_device). Thomas?
> >
> > I guess it could be a non-pci device with such a limitation. I though  
> rte_ethdev is more generic.

When it is added to the rte_pci_device (or rte_device after the planned changes)
the non-PCI devices get this for free... Otherwise I don't understand the point
here.

> 
> 
> > >supported range.  
> >
> > I think, the '-' is unnecessary at the beginning of line. As for me
> > I prefer a fluent text describing the purpose. The '-' is useful for
> > a real list of notes.
> >  
> > >
> > > Signed-off-by: Alejandro Lucero 
> > >
> > > ---
> > > lib/librte_ether/rte_ethdev.c | 7 +++
> > >  lib/librte_ether/rte_ethdev.h | 1 +
> > >  2 files changed, 8 insertions(+)
> > >
> > > diff --git a/lib/librte_ether/rte_ethdev.c  
> > b/lib/librte_ether/rte_ethdev.c  
> > > index a31018e..c0de88a 100644
> > > --- a/lib/librte_ether/rte_ethdev.c
> > > +++ b/lib/librte_ether/rte_ethdev.c
> > > @@ -280,9 +280,16 @@ rte_eth_dev_init(struct rte_pci_driver *pci_drv,
> > >
> > >   /* Invoke PMD device initialization function */
> > >   diag = (*eth_drv->eth_dev_init)(eth_dev);
> > > + if (diag)
> > > + goto err;
> > > +
> > > + if (eth_dev->data->dma_mask)
> > > + diag =  
> > rte_eal_hugepage_check_address_mask(eth_dev->data->dma_mask);  
> > > +  
> >
> > I don't understand what happens if the memory is out of the DMA mask. What
> > can the driver
> > do? Does it just fail?
> >
> > I think that this should be considered during a malloc instead. (Well,
> > there is probably
> > no suitable API for this at the moment.)
> >
> > hugepage memory allocation is done before device initialization. I see  
> easier to leave the normal hugepage code as it is now and add a later call
> if a device requires it.

True. I didn't meant to change hugepages allocation but to change allocation
of memory for a device. If we reserve a set of hugepages, a subset of them can
match the DMA mask for a particular device. When allocating memory for a device,
we can consider the DMA mask and choose only the hugepages we can handle.

Quite frankly, I am not very aware of the memory allocation internals of DPDK so
I can be wrong...

> 
> The only reasonable thing to do is to fail as the amount of required memory
> can not be (safely) allocated.

This is the easiest way and it is not bad. However, what would be a workaround?
If a user gets some allocated memory but out of the DMA mask, how can she fix 
it?
I don't know... Is it deterministic? Would it always allocate memory out of the
mask or only sometimes?

Jan

> 
> 
> > Regards
> > Jan
> >  
> > >   if (diag == 0)
> > >   return 0;
> > >
> > > +err:
> > >   RTE_PMD_DEBUG_TRACE("driver %s: eth_dev_init(vendor_id=0x%u  
> > device_id=0x%x) failed\n",  
> > >   pci_drv->name,
> > >   (unsigned) pci_dev->id.vendor_id,
> > > diff --git a/lib/librte_ether/rte_ethdev.h  
> > b/lib/librte_ether/rte_ethdev.h  
> > > index 2757510..34daa92 100644
> > > --- a/lib/librte_ether/rte_ethdev.h
> > > +++ b/lib/librte_ether/rte_ethdev.h
> > > @@ -1675,6 +1675,7 @@ struct rte_eth_dev_data {
> > >   enum rte_kernel_driver kdrv;/**< Kernel driver passthrough */
> > >   int numa_node;  /**< NUMA node connection */
> > >   const char *drv_name;   /**< Driver name */
> > > + uint64_t dma_mask; /** device supported address space range */
> > >  };
> > >
> > >  /** Device supports hotplug detach */  
> >
> >
> >
> > --
> >Jan 

[dpdk-dev] [PATCH v2 08/11] app/test: convert current pci_test into a single test case

2016-05-12 Thread Thomas Monjalon
2016-05-10 20:13, Jan Viktorin:
> The current test_pci is just a single test case that tests the blacklisting
> of devices. Rename it to test_pci_blacklist and call it from the test_pci.

The functions are also moved. It is confusing.
Maybe this patch can be squashed with the previous one.



[dpdk-dev] [PATCH v2 06/11] app/test: use linked list to store PCI drivers

2016-05-12 Thread Thomas Monjalon
2016-05-10 20:13, Jan Viktorin:
> The test unregisters all real drivers before starting into an array. This
> inflexiable as we can use a linked list for this purpose.

I don't understand this. Maybe some words are missing.

> +/* real drivers (not used for testing) */

What do mean by "not used for testing"?

> +struct pci_driver_list real_pci_driver_list =
> + TAILQ_HEAD_INITIALIZER(real_pci_driver_list);



[dpdk-dev] [PATCH v1 04/10] app/test: support resources archived by tar

2016-05-12 Thread Thomas Monjalon
2016-05-12 17:26, Thomas Monjalon:
> 2016-05-06 12:48, Jan Viktorin:
> > +$(eval $(call resource,test_resource_tar,resource.tar))
> 
> This tar resource is not provided. Should it be created in the Makefile?

Sorry I was looking at v1 while it is implemented in v2.


[dpdk-dev] [PATCH v2 01/11] app/test: introduce resources for tests

2016-05-12 Thread Jan Viktorin
On Thu, 12 May 2016 17:19:21 +0200
Thomas Monjalon  wrote:

> 2016-05-10 20:13, Jan Viktorin:
> > +REGISTER_TEST_COMMAND(resource_cmd);  
> 
> Should you add this test in group 1 of autotest_data.py?

Will do for v3. This way:

...  
 84 {   

 85  "Name" :   "Common autotest",  

 86  "Command" :"common_autotest",  

 87  "Func" :   default_autotest,   

 88  "Report" : None,   

 89 },  

 90 {   

 91  "Name" :   "Resource autotest",

 92  "Command" :"resource_autotest",

 93  "Func" :   default_autotest,   

 94  "Report" : None,   

 95 },  

 96 {   

 97  "Name" :   "Dump log history", 

 98  "Command" :"dump_log_history", 

 99  "Func" :   dump_autotest,  

100  "Report" : None,   

101 },
...

-- 
   Jan Viktorin  E-mail: Viktorin at RehiveTech.com
   System Architect  Web:www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic


[dpdk-dev] [PATCH v1 04/10] app/test: support resources archived by tar

2016-05-12 Thread Thomas Monjalon
2016-05-06 12:48, Jan Viktorin:
> When needing a more complex resource (a file hierarchy), packing every single
> file as a single resource would be very ineffective. For that purpose, it is
> possible to pack the files into a tar archive, extract it before test from the
> resource and finally clean up all the created files.
> 
> This patch introduces functions resource_untar and resource_rm_by_tar to
> perform those tasks. An example of using those functions is included as a 
> test.
> 
> Signed-off-by: Jan Viktorin 
> ---
>  app/test/Makefile|   4 ++
>  app/test/resource.c  | 180 
> +++
>  app/test/resource.h  |  13 
>  app/test/test_resource.c |  29 
>  4 files changed, 226 insertions(+)
> 
> diff --git a/app/test/Makefile b/app/test/Makefile
> index a9502f1..90acd63 100644
> --- a/app/test/Makefile
> +++ b/app/test/Makefile
> @@ -77,6 +77,9 @@ SRCS-y += test.c
>  SRCS-y += resource.c
>  SRCS-y += test_resource.c
>  $(eval $(call resource,test_resource_c,resource.c))
> +$(eval $(call resource,test_resource_tar,resource.tar))

This tar resource is not provided. Should it be created in the Makefile?


[dpdk-dev] [PATCH v2 01/11] app/test: introduce resources for tests

2016-05-12 Thread Thomas Monjalon
2016-05-10 20:13, Jan Viktorin:
> +REGISTER_TEST_COMMAND(resource_cmd);

Should you add this test in group 1 of autotest_data.py?


[dpdk-dev] [dpdk-dev,3/3] nfp: set device dma mask

2016-05-12 Thread Jan Viktorin
On Thu, 12 May 2016 16:13:55 +0100
Alejandro Lucero  wrote:

> On Thu, May 12, 2016 at 4:03 PM, Jan Viktorin 
> wrote:
> 
> > On Thu, 12 May 2016 15:34:00 +0100
> > "Alejandro.Lucero"  wrote:
> >  
> > > - Just hugepages within the supported range will be available.  
> >
> > Again the hyphen is redundant here.
> >
> > By the way, this text does not describe the change well. If I understood
> > the whole patch set (I am not quite sure now), the initialization would
> > fail if there are hugepages out of the given DMA mask. Am I wrong?
> >
> >  
> You are right.
> 
> 
> > I'd expect something like "NFP supports DMA address in range ...".
> >
> >  
> That is a good idea. I was thinking on adding a memseg dump info as well
> which would help to understand this issue and other related to memory
> allocation.
> 
> 
> > >
> > > Signed-off-by: Alejandro Lucero 
> > >
> > > ---
> > > drivers/net/nfp/nfp_net.c | 11 +++
> > >  1 file changed, 11 insertions(+)
> > >
> > > diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
> > > index ea5a2a3..e0e444a 100644
> > > --- a/drivers/net/nfp/nfp_net.c
> > > +++ b/drivers/net/nfp/nfp_net.c
> > > @@ -115,6 +115,14 @@ enum nfp_qcp_ptr {
> > >   NFP_QCP_WRITE_PTR
> > >  };
> > >
> > > +#ifndef DMA_64BIT_MASK
> > > +#define DMA_64BIT_MASK  0xULL
> > > +#endif
> > > +
> > > +#ifndef DMA_BIT_MASK
> > > +#define DMA_BIT_MASK(n) (((n) == 64) ? DMA_64BIT_MASK : ((1ULL<<(n))-1))
> > > +#endif  
> >
> > This is quite a generic code, I'd put it into the EAL. Probably, it should
> > be renamed to something like RTE_DMA_BIT_MASK.  
> 
> 
> OK.

By the way:

CHECK:SPACING: spaces preferred around that '<<' (ctx:VxV)
#33: FILE: drivers/net/nfp/nfp_net.c:123:
+#define DMA_BIT_MASK(n) (((n) == 64) ? DMA_64BIT_MASK : ((1ULL<<(n))-1))
   ^

CHECK:SPACING: spaces preferred around that '-' (ctx:VxV)
#33: FILE: drivers/net/nfp/nfp_net.c:123:
+#define DMA_BIT_MASK(n) (((n) == 64) ? DMA_64BIT_MASK : ((1ULL<<(n))-1))
 ^

total: 0 errors, 0 warnings, 2 checks, 23 lines checked

> 
> 
> >
> >
> > +  
> > >  /*
> > >   * nfp_qcp_ptr_add - Add the value to the selected pointer of a queue
> > >   * @q: Base address for queue structure
> > > @@ -2441,6 +2449,9 @@ nfp_net_init(struct rte_eth_dev *eth_dev)
> > >   /* Recording current stats counters values */
> > >   nfp_net_stats_reset(eth_dev);
> > >
> > > + /* Setting dma_mask */
> > > + eth_dev->data->dma_mask = DMA_BIT_MASK(40);  
> >
> > Can we read this from /sys/bus/pci/devices/*/dma_mask_bits? I am not sure
> > whether is this generic enough but I can see dma_mask_bits for the PCI
> > devices on my PC.
> >
> >  
> The kernel adds a default dma mask when device scanning (at least for PCI
> devices). It is a device driver who knows about specific DMA addressing
> limitations. For example, this is done with UIO (igb_uio) and the using
> sysfs would be fine (but then you should add support for specifying a dma
> mask in igb_uio as a module param) but this is not true for VFIO.
> 

Makes sense...

Jan

> 
> 
> > Regards
> > Jan
> >  
> > > +
> > >   return 0;
> > >  }
> > >  
> >
> >
> >
> >
> > --
> >Jan Viktorin  E-mail: Viktorin at RehiveTech.com
> >System Architect  Web:www.RehiveTech.com
> >RehiveTech
> >Brno, Czech Republic
> >  



-- 
   Jan Viktorin  E-mail: Viktorin at RehiveTech.com
   System Architect  Web:www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic


[dpdk-dev] [PATCH v2 01/11] app/test: introduce resources for tests

2016-05-12 Thread Thomas Monjalon
2016-05-10 20:13, Jan Viktorin:
> +struct resource {
> + const char *name;  /** Unique name of the resource */
> + const char *begin; /** Start of resource data */
> + const char *end;   /** End of resource data */
> + TAILQ_ENTRY(resource) next;
> +};

There is no doxygen generated from this file, but you can keep
this format in case we decide to generate one.
Here the comments after the fields should start with /**<



[dpdk-dev] [dpdk-dev, 1/3] eal/linux: add function for checking hugepages within device supported address range

2016-05-12 Thread Jan Viktorin
On Thu, 12 May 2016 15:33:58 +0100
"Alejandro.Lucero"  wrote:

> - This is needed for avoiding problems with devices not being able to address
>all the physical available memory.

WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a 
maximum 75 chars per line)
#14: 
- This is needed for avoiding problems with devices not being able to address

WARNING:LONG_LINE: line over 80 characters
#57: FILE: lib/librte_eal/linuxapp/eal/eal_memory.c:1048:
+   const struct rte_mem_config *mcfg = 
rte_eal_get_configuration()->mem_config;

ERROR:SPACING: spaces required around that '=' (ctx:WxV)
#59: FILE: lib/librte_eal/linuxapp/eal/eal_memory.c:1050:
+   int i =0;
  ^

WARNING:LONG_LINE_STRING: line over 80 characters
#63: FILE: lib/librte_eal/linuxapp/eal/eal_memory.c:1054:
+   RTE_LOG(DEBUG, EAL, "Checking page with address %"PRIx64" and 
device"

WARNING:LONG_LINE_STRING: line over 80 characters
#66: FILE: lib/librte_eal/linuxapp/eal/eal_memory.c:1057:
+   RTE_LOG(ERR, EAL, "Allocated hugepages are out of 
device address"

total: 1 errors, 4 warnings, 45 lines checked

Regards
Jan

> 
> Signed-off-by: Alejandro Lucero 
> 
> ---
> lib/librte_eal/common/include/rte_memory.h |  6 ++
>  lib/librte_eal/linuxapp/eal/eal_memory.c   | 27 +++
>  2 files changed, 33 insertions(+)
> 
> diff --git a/lib/librte_eal/common/include/rte_memory.h 
> b/lib/librte_eal/common/include/rte_memory.h
> index f8dbece..67b0b28 100644
> --- a/lib/librte_eal/common/include/rte_memory.h
> +++ b/lib/librte_eal/common/include/rte_memory.h
> @@ -256,6 +256,12 @@ rte_mem_phy2mch(uint32_t memseg_id __rte_unused, const 
> phys_addr_t phy_addr)
>  }
>  #endif
>  
> +/**
> + * Check hugepages are within the supported
> + * device address space range.
> + */
> +int rte_eal_hugepage_check_address_mask(uint64_t dma_mask);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
> b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..2cd046d 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -1037,6 +1037,33 @@ calc_num_pages_per_socket(uint64_t * memory,
>  }
>  
>  /*
> + * Some devices have addressing limitations. A PMD will indirectly call this
> + * function raising an error if any hugepage is out of address range 
> supported.
> + * As hugepages are ordered by physical address, there is nothing to do as
> + * any other hugepage available will be out of range as well.
> + */
> +int
> +rte_eal_hugepage_check_address_mask(uint64_t dma_mask)
> +{
> + const struct rte_mem_config *mcfg = 
> rte_eal_get_configuration()->mem_config;
> + phys_addr_t physaddr;
> + int i =0;
> +
> + while (i < RTE_MAX_MEMSEG && mcfg->memseg[i].len > 0) {
> + physaddr = mcfg->memseg[i].phys_addr + mcfg->memseg[i].len;
> + RTE_LOG(DEBUG, EAL, "Checking page with address %"PRIx64" and 
> device"
> + " mask 0x%"PRIx64"\n", physaddr, dma_mask);
> + if (physaddr & ~dma_mask) {
> + RTE_LOG(ERR, EAL, "Allocated hugepages are out of 
> device address"
> + " range.");
> + return -1;
> + }
> + i++;
> + }
> + return 0;
> +}
> +
> +/*
>   * Prepare physical memory mapping: fill configuration structure with
>   * these infos, return 0 on success.
>   *  1. map N huge pages in separate files in hugetlbfs



-- 
   Jan Viktorin  E-mail: Viktorin at RehiveTech.com
   System Architect  Web:www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic


[dpdk-dev] [PATCH v1 02/10] app/test: support resources externally linked

2016-05-12 Thread Thomas Monjalon
2016-05-09 17:19, Jan Viktorin:
> On Fri, 06 May 2016 16:32:53 +0200
> Thomas Monjalon  wrote:
> 
> > It looks a lot too much tricky to be integrated without code comments ;)
> > Please make a documentation effort.
> > 
> [...]
> 
> Thomas,
> 
> is it OK to include the PCI test enhancements? Or should I post only the
> fixed "resource framework"?

Yes it is OK.


[dpdk-dev] [dpdk-dev,3/3] nfp: set device dma mask

2016-05-12 Thread Jan Viktorin
On Thu, 12 May 2016 15:34:00 +0100
"Alejandro.Lucero"  wrote:

> - Just hugepages within the supported range will be available.

Again the hyphen is redundant here.

By the way, this text does not describe the change well. If I understood
the whole patch set (I am not quite sure now), the initialization would
fail if there are hugepages out of the given DMA mask. Am I wrong?

I'd expect something like "NFP supports DMA address in range ...".

> 
> Signed-off-by: Alejandro Lucero 
> 
> ---
> drivers/net/nfp/nfp_net.c | 11 +++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
> index ea5a2a3..e0e444a 100644
> --- a/drivers/net/nfp/nfp_net.c
> +++ b/drivers/net/nfp/nfp_net.c
> @@ -115,6 +115,14 @@ enum nfp_qcp_ptr {
>   NFP_QCP_WRITE_PTR
>  };
>  
> +#ifndef DMA_64BIT_MASK
> +#define DMA_64BIT_MASK  0xULL
> +#endif
> +
> +#ifndef DMA_BIT_MASK
> +#define DMA_BIT_MASK(n) (((n) == 64) ? DMA_64BIT_MASK : ((1ULL<<(n))-1))
> +#endif

This is quite a generic code, I'd put it into the EAL. Probably, it should
be renamed to something like RTE_DMA_BIT_MASK.

> +
>  /*
>   * nfp_qcp_ptr_add - Add the value to the selected pointer of a queue
>   * @q: Base address for queue structure
> @@ -2441,6 +2449,9 @@ nfp_net_init(struct rte_eth_dev *eth_dev)
>   /* Recording current stats counters values */
>   nfp_net_stats_reset(eth_dev);
>  
> + /* Setting dma_mask */
> + eth_dev->data->dma_mask = DMA_BIT_MASK(40);

Can we read this from /sys/bus/pci/devices/*/dma_mask_bits? I am not sure
whether is this generic enough but I can see dma_mask_bits for the PCI
devices on my PC.

Regards
Jan

> +
>   return 0;
>  }
>  




-- 
   Jan Viktorin  E-mail: Viktorin at RehiveTech.com
   System Architect  Web:www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic


[dpdk-dev] [PATCH v1 01/10] app/test: introduce resources for tests

2016-05-12 Thread Jan Viktorin
On Thu, 12 May 2016 16:58:31 +0200
Thomas Monjalon  wrote:

> 2016-05-06 18:20, Jan Viktorin:
> > On Fri, 06 May 2016 16:01:08 +0200
> > Thomas Monjalon  wrote:  
> > > 2016-05-06 12:48, Jan Viktorin:  
> > > > +static struct resource linkres_ ##_n = {   \
> > > > +   .name = RTE_STR(_n), \
> > > > +   .beg = _b,   \
> > > > +   .end = _e,   \
> > > > +};   \
> > > > +__REGISTER_RESOURCE(linkres_ ##_n)
> > > 
> > > Please avoid nested macros.  
> > 
> > I don't understand this. I am avoiding code duplication. Is it wrong?  
> 
> Which code duplication?
> Is __REGISTER_RESOURCE used elsewhere?

No :). I've already posted a fix of this.

-- 
   Jan Viktorin  E-mail: Viktorin at RehiveTech.com
   System Architect  Web:www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic


[dpdk-dev] [PATCH v1 01/10] app/test: introduce resources for tests

2016-05-12 Thread Thomas Monjalon
2016-05-06 18:20, Jan Viktorin:
> On Fri, 06 May 2016 16:01:08 +0200
> Thomas Monjalon  wrote:
> > 2016-05-06 12:48, Jan Viktorin:
> > > +static struct resource linkres_ ##_n = {   \
> > > + .name = RTE_STR(_n), \
> > > + .beg = _b,   \
> > > + .end = _e,   \
> > > +};   \
> > > +__REGISTER_RESOURCE(linkres_ ##_n)  
> > 
> > Please avoid nested macros.
> 
> I don't understand this. I am avoiding code duplication. Is it wrong?

Which code duplication?
Is __REGISTER_RESOURCE used elsewhere?


[dpdk-dev] [dpdk-dev, 2/3] eth_dev: add support for device dma mask

2016-05-12 Thread Jan Viktorin
Hello Alejandro,

On Thu, 12 May 2016 15:33:59 +0100
"Alejandro.Lucero"  wrote:

> - New dma_mask field in rte_eth_dev_data.
>  - If PMD sets device dma_mask, call to check hugepages within

I think that one of the purposes of DPDK is to support DMA transfers
in userspace. In other words, I can see no reason to support dma_mask
at the ethdev level only.

We should consider adding this to the generic struct rte_device
(currently rte_pci_device). Thomas?

>supported range.

I think, the '-' is unnecessary at the beginning of line. As for me
I prefer a fluent text describing the purpose. The '-' is useful for
a real list of notes.

> 
> Signed-off-by: Alejandro Lucero 
> 
> ---
> lib/librte_ether/rte_ethdev.c | 7 +++
>  lib/librte_ether/rte_ethdev.h | 1 +
>  2 files changed, 8 insertions(+)
> 
> diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
> index a31018e..c0de88a 100644
> --- a/lib/librte_ether/rte_ethdev.c
> +++ b/lib/librte_ether/rte_ethdev.c
> @@ -280,9 +280,16 @@ rte_eth_dev_init(struct rte_pci_driver *pci_drv,
>  
>   /* Invoke PMD device initialization function */
>   diag = (*eth_drv->eth_dev_init)(eth_dev);
> + if (diag)
> + goto err;
> +
> + if (eth_dev->data->dma_mask)
> + diag = 
> rte_eal_hugepage_check_address_mask(eth_dev->data->dma_mask);
> +

I don't understand what happens if the memory is out of the DMA mask. What can 
the driver
do? Does it just fail?

I think that this should be considered during a malloc instead. (Well, there is 
probably
no suitable API for this at the moment.)

Regards
Jan

>   if (diag == 0)
>   return 0;
>  
> +err:
>   RTE_PMD_DEBUG_TRACE("driver %s: eth_dev_init(vendor_id=0x%u 
> device_id=0x%x) failed\n",
>   pci_drv->name,
>   (unsigned) pci_dev->id.vendor_id,
> diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
> index 2757510..34daa92 100644
> --- a/lib/librte_ether/rte_ethdev.h
> +++ b/lib/librte_ether/rte_ethdev.h
> @@ -1675,6 +1675,7 @@ struct rte_eth_dev_data {
>   enum rte_kernel_driver kdrv;/**< Kernel driver passthrough */
>   int numa_node;  /**< NUMA node connection */
>   const char *drv_name;   /**< Driver name */
> + uint64_t dma_mask; /** device supported address space range */
>  };
>  
>  /** Device supports hotplug detach */



-- 
   Jan Viktorin  E-mail: Viktorin at RehiveTech.com
   System Architect  Web:www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic


[dpdk-dev] [dpdk-dev,3/3] nfp: set device dma mask

2016-05-12 Thread Alejandro Lucero
On Thu, May 12, 2016 at 4:03 PM, Jan Viktorin 
wrote:

> On Thu, 12 May 2016 15:34:00 +0100
> "Alejandro.Lucero"  wrote:
>
> > - Just hugepages within the supported range will be available.
>
> Again the hyphen is redundant here.
>
> By the way, this text does not describe the change well. If I understood
> the whole patch set (I am not quite sure now), the initialization would
> fail if there are hugepages out of the given DMA mask. Am I wrong?
>
>
You are right.


> I'd expect something like "NFP supports DMA address in range ...".
>
>
That is a good idea. I was thinking on adding a memseg dump info as well
which would help to understand this issue and other related to memory
allocation.


> >
> > Signed-off-by: Alejandro Lucero 
> >
> > ---
> > drivers/net/nfp/nfp_net.c | 11 +++
> >  1 file changed, 11 insertions(+)
> >
> > diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
> > index ea5a2a3..e0e444a 100644
> > --- a/drivers/net/nfp/nfp_net.c
> > +++ b/drivers/net/nfp/nfp_net.c
> > @@ -115,6 +115,14 @@ enum nfp_qcp_ptr {
> >   NFP_QCP_WRITE_PTR
> >  };
> >
> > +#ifndef DMA_64BIT_MASK
> > +#define DMA_64BIT_MASK  0xULL
> > +#endif
> > +
> > +#ifndef DMA_BIT_MASK
> > +#define DMA_BIT_MASK(n) (((n) == 64) ? DMA_64BIT_MASK : ((1ULL<<(n))-1))
> > +#endif
>
> This is quite a generic code, I'd put it into the EAL. Probably, it should
> be renamed to something like RTE_DMA_BIT_MASK.


OK.


>
>
> +
> >  /*
> >   * nfp_qcp_ptr_add - Add the value to the selected pointer of a queue
> >   * @q: Base address for queue structure
> > @@ -2441,6 +2449,9 @@ nfp_net_init(struct rte_eth_dev *eth_dev)
> >   /* Recording current stats counters values */
> >   nfp_net_stats_reset(eth_dev);
> >
> > + /* Setting dma_mask */
> > + eth_dev->data->dma_mask = DMA_BIT_MASK(40);
>
> Can we read this from /sys/bus/pci/devices/*/dma_mask_bits? I am not sure
> whether is this generic enough but I can see dma_mask_bits for the PCI
> devices on my PC.
>
>
The kernel adds a default dma mask when device scanning (at least for PCI
devices). It is a device driver who knows about specific DMA addressing
limitations. For example, this is done with UIO (igb_uio) and the using
sysfs would be fine (but then you should add support for specifying a dma
mask in igb_uio as a module param) but this is not true for VFIO.



> Regards
> Jan
>
> > +
> >   return 0;
> >  }
> >
>
>
>
>
> --
>Jan Viktorin  E-mail: Viktorin at RehiveTech.com
>System Architect  Web:www.RehiveTech.com
>RehiveTech
>Brno, Czech Republic
>


[dpdk-dev] [PATCH] i40e: fix flexible payload selection

2016-05-12 Thread Jingjing Wu
When setting up flexible payload selection rules, it is allowed
that setting value to 63 to disable the rule (NONUSE_FLX_PIT_DEST_OFF).
However, MK_FLX_PIT macro is always adding an offset value 50
(I40E_FLX_OFFSET_IN_FIELD_VECTOR), it will be set to "63 + 50" and
when setting NONUSE_FLX_PIT_DEST_OFF to disable it. It breaks
the functionality.
This patch fixes this issue.

Fixes: d8b90c4eabe9 ("i40e: take flow director flexible payload
  configuration")

Reported-by: Michael Habibi 
Signed-off-by: Jingjing Wu 
---
 drivers/net/i40e/i40e_fdir.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/i40e/i40e_fdir.c b/drivers/net/i40e/i40e_fdir.c
index 8aa41e5..efbcd18 100644
--- a/drivers/net/i40e/i40e_fdir.c
+++ b/drivers/net/i40e/i40e_fdir.c
@@ -94,7 +94,9 @@
I40E_PRTQF_FLX_PIT_SOURCE_OFF_MASK) | \
(((fsize) << I40E_PRTQF_FLX_PIT_FSIZE_SHIFT) & \
I40E_PRTQF_FLX_PIT_FSIZE_MASK) | \
-   dst_offset) + I40E_FLX_OFFSET_IN_FIELD_VECTOR) << \
+   dst_offset) == NONUSE_FLX_PIT_DEST_OFF ? \
+   NONUSE_FLX_PIT_DEST_OFF : \
+   ((dst_offset) + I40E_FLX_OFFSET_IN_FIELD_VECTOR)) << \
I40E_PRTQF_FLX_PIT_DEST_OFF_SHIFT) & \
I40E_PRTQF_FLX_PIT_DEST_OFF_MASK))

-- 
2.4.0



[dpdk-dev] [dpdk-dev, 2/3] eth_dev: add support for device dma mask

2016-05-12 Thread Alejandro Lucero
Hi Jan

On Thu, May 12, 2016 at 3:52 PM, Jan Viktorin 
wrote:

> Hello Alejandro,
>
> On Thu, 12 May 2016 15:33:59 +0100
> "Alejandro.Lucero"  wrote:
>
> > - New dma_mask field in rte_eth_dev_data.
> >  - If PMD sets device dma_mask, call to check hugepages within
>
> I think that one of the purposes of DPDK is to support DMA transfers
> in userspace. In other words, I can see no reason to support dma_mask
> at the ethdev level only.
>
> The limitation is a device limitation so I can not see a better place for
adding the device dma mask.


> We should consider adding this to the generic struct rte_device
> (currently rte_pci_device). Thomas?
>
> I guess it could be a non-pci device with such a limitation. I though
rte_ethdev is more generic.


> >supported range.
>
> I think, the '-' is unnecessary at the beginning of line. As for me
> I prefer a fluent text describing the purpose. The '-' is useful for
> a real list of notes.
>
> >
> > Signed-off-by: Alejandro Lucero 
> >
> > ---
> > lib/librte_ether/rte_ethdev.c | 7 +++
> >  lib/librte_ether/rte_ethdev.h | 1 +
> >  2 files changed, 8 insertions(+)
> >
> > diff --git a/lib/librte_ether/rte_ethdev.c
> b/lib/librte_ether/rte_ethdev.c
> > index a31018e..c0de88a 100644
> > --- a/lib/librte_ether/rte_ethdev.c
> > +++ b/lib/librte_ether/rte_ethdev.c
> > @@ -280,9 +280,16 @@ rte_eth_dev_init(struct rte_pci_driver *pci_drv,
> >
> >   /* Invoke PMD device initialization function */
> >   diag = (*eth_drv->eth_dev_init)(eth_dev);
> > + if (diag)
> > + goto err;
> > +
> > + if (eth_dev->data->dma_mask)
> > + diag =
> rte_eal_hugepage_check_address_mask(eth_dev->data->dma_mask);
> > +
>
> I don't understand what happens if the memory is out of the DMA mask. What
> can the driver
> do? Does it just fail?
>
> I think that this should be considered during a malloc instead. (Well,
> there is probably
> no suitable API for this at the moment.)
>
> hugepage memory allocation is done before device initialization. I see
easier to leave the normal hugepage code as it is now and add a later call
if a device requires it.

The only reasonable thing to do is to fail as the amount of required memory
can not be (safely) allocated.


> Regards
> Jan
>
> >   if (diag == 0)
> >   return 0;
> >
> > +err:
> >   RTE_PMD_DEBUG_TRACE("driver %s: eth_dev_init(vendor_id=0x%u
> device_id=0x%x) failed\n",
> >   pci_drv->name,
> >   (unsigned) pci_dev->id.vendor_id,
> > diff --git a/lib/librte_ether/rte_ethdev.h
> b/lib/librte_ether/rte_ethdev.h
> > index 2757510..34daa92 100644
> > --- a/lib/librte_ether/rte_ethdev.h
> > +++ b/lib/librte_ether/rte_ethdev.h
> > @@ -1675,6 +1675,7 @@ struct rte_eth_dev_data {
> >   enum rte_kernel_driver kdrv;/**< Kernel driver passthrough */
> >   int numa_node;  /**< NUMA node connection */
> >   const char *drv_name;   /**< Driver name */
> > + uint64_t dma_mask; /** device supported address space range */
> >  };
> >
> >  /** Device supports hotplug detach */
>
>
>
> --
>Jan Viktorin  E-mail: Viktorin at RehiveTech.com
>System Architect  Web:www.RehiveTech.com
>RehiveTech
>Brno, Czech Republic
>


[dpdk-dev] [PATCHv3 1/2] config/armv8a: disable igb_uio

2016-05-12 Thread Santosh Shukla
On Thu, May 12, 2016 at 05:52:54PM +0800, Jianbo Liu wrote:
> On 12 May 2016 at 16:57, Santosh Shukla
>  wrote:
> > On Thu, May 12, 2016 at 01:54:13PM +0800, Jianbo Liu wrote:
> >> On 12 May 2016 at 13:06, Santosh Shukla
> >>  wrote:
> >> > On Thu, May 12, 2016 at 11:42:26AM +0800, Jianbo Liu wrote:
> >> >> On 12 May 2016 at 11:17, Santosh Shukla
> >> >>  wrote:
> >> >> > On Thu, May 12, 2016 at 10:01:05AM +0800, Jianbo Liu wrote:
> >> >> >> On 12 May 2016 at 02:25, Stephen Hemminger  >> >> >> networkplumber.org> wrote:
> >> >> >> > On Wed, 11 May 2016 22:32:16 +0530
> >> >> >> > Jerin Jacob  wrote:
> >> >> >> >
> >> >> >> >> On Wed, May 11, 2016 at 08:22:59AM -0700, Stephen Hemminger wrote:
> >> >> >> >> > On Wed, 11 May 2016 19:17:58 +0530
> >> >> >> >> > Hemant Agrawal  wrote:
> >> >> >> >> >
> >> >> >> >> > > IGB_UIO not supported for arm64 arch in kernel so disable.
> >> >> >> >> > >
> >> >> >> >> > > Signed-off-by: Hemant Agrawal 
> >> >> >> >> > > Reviewed-by: Santosh Shukla  >> >> >> >> > > caviumnetworks.com>
> >> >> >> >> >
> >> >> >> >> > Really, I have use IGB_UIO on ARM64
> >> >> >> >>
> >> >> >> >> May I know what is the technical use case for igb_uio on arm64
> >> >> >> >> which cannot be addressed through vfio or vfioionommu.
> >> >> >> >
> >> >> >> > I was running on older kernel which did not support vfioionommu 
> >> >> >> > mode.
> >> >> >>
> >> >> >> As I said, most of DPDK developers are not kernel developers. They 
> >> >> >> may
> >> >> >> have their own kernel tree, and couldn't like to upgrade to latest
> >> >> >> kernel.
> >> >> >> They can choose to use or not use igb_uio when binding the driver. 
> >> >> >> But
> >> >> >> blindly disabling it in the base config seems unreasonable.
> >> >> >
> >> >> > if user keeping his own kernel so they could also keep IGB_UIO=y in 
> >> >> > their local
> >> >> Most likely they don't have local dpdk tree. They write their own
> >> >> applications, complie and link to dpdk lib, then done.
> >> >>
> >> >> > dpdk tree. Why are you imposing user-x custome depedancy on upstream 
> >> >> > dpdk base
> >> >> Customer requiremnts is important. I want they can choose the way they 
> >> >> like.
> >> >>
> >> >
> >> > so you choose to keep igb_uio option, provided arch doesn't support?
> >> > new user did reported issues with igb_uio for arm64, refer this thread 
> >> > [1], as
> >> > well hemanth too faced issues. we want to avoid that.
> >> >
> >> > If customer maintaing out-of-tree kernel then he can also switch to 
> >> > vfio-way.
> >> > isn;t it?
> >> >
> >> >> > config. Is it not enough for explanation that - Base config ie.. 
> >> >> > armv8 doesn;t
> >> >> > support pci mmap, so igb_uio is n/a. New user wont able to build/run 
> >> >> > dpdk/arm64
> >> >> > in igb_uio-way, He'll prefer to use upstream stuff. I think, you are 
> >> >> > not making
> >> >> You are wrong, he can build dpdk. If he like to use upstream without
> >> >> patching, he can use vfio.
> >> >
> >> > I disagree, we want to avoid [1] for new user.
> >> >
> >> >> But you can't ignore the need from old user which is more comfortable
> >> >> with older kernel.
> >> >>
> >> > arm/arm64 dpdk support recently added and I am guessing, most likely 
> >> > customer
> >> > using near latest kernel, switching to vfio won't be so difficult.
> >> >
> >> > Or can you take up responsibility of upstreaming pci mmap patch, then we 
> >> > don't
> >> > need this patch.
> >> >
> >> > [1] http://dpdk.org/ml/archives/dev/2016-January/031313.html
> >>
> >> Can you read carefully about the guide at
> >> http://dpdk.org/doc/guides/linux_gsg/build_dpdk.html? It says to use
> >> uio_pci_generic, igb_uio or vfio-pci.
> >
> > *** applicable and works for x86 only, not for arm64: because pci mmap 
> > support
> > not present for arm64, in that case we should update the doc.
> >
> >> Could it be possible that the user in that thread has already read and
> >> tried them all and found that he can't enable vifo with his kernel,
> >> and igb_uio is the easy way for him and asked for help from community?
> >> If so, we have no choice but keeping igb_uio enabled.
> >
> > By then vfionoiommu support was wip progress in dpdk/linux. but now it 
> > merged
> > and it works. So no need to retain igb_uio in base config for which to work 
> > -
> > user need to use mmap patch at linux side.
> 
> We can't decide which kernel user will use.
>

yes, we can't decide kernel for user but we should be explicit to user on - what
works for dpdk/linux out-of-box vs what could work with use of out-of-tree
patch/RFC's.. example igb_uio.

> >
> > Or can you maintain out-of-tree pci mmap patch/ kerne source and make it
> > explicit somewhere in dpdk build doc that - if user want igb_uio way then
> > use kernel/mmap patch from x location.
> 
> The patch is in the kernel maillist, and user google it.

there are feature specific rfc's in plenty in lkml/qemu mailing list,  and you
suggest- user to hunt for all those information. Is 

[dpdk-dev] [PATCH 01/20] thunderx/nicvf/base: add hardware API for ThunderX nicvf inbuilt NIC

2016-05-12 Thread Pattan, Reshma


> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Jerin Jacob
> Sent: Saturday, May 7, 2016 4:16 PM
> To: dev at dpdk.org
> Cc: thomas.monjalon at 6wind.com; Richardson, Bruce
> ; Jerin Jacob
> ; Maciej Czekaj
> ; Kamil Rytarowski
> ; Zyta Szpak
> ; Slawomir Rosek ;
> Radoslaw Biernacki 
> Subject: [dpdk-dev] [PATCH 01/20] thunderx/nicvf/base: add hardware API for
> ThunderX nicvf inbuilt NIC
> 
> +int
> +nicvf_reg_poll_interrupts(struct nicvf *nic)
> +{
> + int msg = 0;
> + uint64_t intr;
> +
> + intr = nicvf_reg_read(nic, NIC_VF_INT);
> + if (intr & NICVF_INTR_MBOX_MASK) {
> + nicvf_reg_write(nic, NIC_VF_INT, NICVF_INTR_MBOX_MASK);
> + msg = nicvf_handle_mbx_intr(nic);
> + }
> + if (intr & NICVF_INTR_QS_ERR_MASK) {
> + nicvf_reg_write(nic, NIC_VF_INT, NICVF_INTR_QS_ERR_MASK);
> + nicvf_handle_qset_err_intr(nic);
> + }
> + return msg;
> +}
> +
> +
[Reshma]: Multiple blank lines

> +int
> +nicvf_qset_rbdr_reclaim(struct nicvf *nic, uint16_t qidx)
> +{
> + uint64_t status;
> + int timeout = 10;
> + struct nicvf_rbdr *rbdr = nic->rbdr;
> +
> + /* Save head and tail pointers for freeing up buffers */
> + if (rbdr) {
> + rbdr->head = nicvf_queue_reg_read(nic,
> +   NIC_QSET_RBDR_0_1_HEAD,
> +   qidx) >> 3;
> + rbdr->tail = nicvf_queue_reg_read(nic,
> +   NIC_QSET_RBDR_0_1_TAIL,
> +   qidx) >> 3;

 [Reshma]: Mix of tabs + spaces, u can use all tabs, U can correct this for 
other parts of  the file and other files too. 

Thanks,
Reshma



[dpdk-dev] [PATCH 04/20] thunderx/nicvf: add get_reg and get_reg_length support

2016-05-12 Thread Pattan, Reshma


> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Jerin Jacob
> Sent: Saturday, May 7, 2016 4:16 PM
> To: dev at dpdk.org
> Cc: thomas.monjalon at 6wind.com; Richardson, Bruce
> ; Jerin Jacob
> ; Maciej Czekaj
> ; Kamil Rytarowski
> ; Zyta Szpak
> ; Slawomir Rosek ;
> Radoslaw Biernacki 
> Subject: [dpdk-dev] [PATCH 04/20] thunderx/nicvf: add get_reg and
> get_reg_length support
> 
> +
> +static int
> +nicvf_dev_get_regs(struct rte_eth_dev *dev, struct rte_dev_reg_info
> +*regs) {
> + uint64_t *data = regs->data;
> + struct nicvf *nic = nicvf_pmd_priv(dev);
> +
> + if (data == NULL)
> + return -EINVAL;

nicvf_reg_dump prints to stdout if data in NULL, so do we still want to return 
here?

Thanks,
Reshma





[dpdk-dev] [PATCH 3/3] nfp: set device dma mask

2016-05-12 Thread Alejandro Lucero
 - Just hugepages within the supported range will be available.

Signed-off-by: Alejandro Lucero 
---
 drivers/net/nfp/nfp_net.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index ea5a2a3..e0e444a 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -115,6 +115,14 @@ enum nfp_qcp_ptr {
NFP_QCP_WRITE_PTR
 };

+#ifndef DMA_64BIT_MASK
+#define DMA_64BIT_MASK  0xULL
+#endif
+
+#ifndef DMA_BIT_MASK
+#define DMA_BIT_MASK(n) (((n) == 64) ? DMA_64BIT_MASK : ((1ULL<<(n))-1))
+#endif
+
 /*
  * nfp_qcp_ptr_add - Add the value to the selected pointer of a queue
  * @q: Base address for queue structure
@@ -2441,6 +2449,9 @@ nfp_net_init(struct rte_eth_dev *eth_dev)
/* Recording current stats counters values */
nfp_net_stats_reset(eth_dev);

+   /* Setting dma_mask */
+   eth_dev->data->dma_mask = DMA_BIT_MASK(40);
+
return 0;
 }

-- 
1.9.1



[dpdk-dev] [PATCH 2/3] eth_dev: add support for device dma mask

2016-05-12 Thread Alejandro Lucero
 - New dma_mask field in rte_eth_dev_data.
 - If PMD sets device dma_mask, call to check hugepages within
   supported range.

Signed-off-by: Alejandro Lucero 
---
 lib/librte_ether/rte_ethdev.c | 7 +++
 lib/librte_ether/rte_ethdev.h | 1 +
 2 files changed, 8 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index a31018e..c0de88a 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -280,9 +280,16 @@ rte_eth_dev_init(struct rte_pci_driver *pci_drv,

/* Invoke PMD device initialization function */
diag = (*eth_drv->eth_dev_init)(eth_dev);
+   if (diag)
+   goto err;
+
+   if (eth_dev->data->dma_mask)
+   diag = 
rte_eal_hugepage_check_address_mask(eth_dev->data->dma_mask);
+
if (diag == 0)
return 0;

+err:
RTE_PMD_DEBUG_TRACE("driver %s: eth_dev_init(vendor_id=0x%u 
device_id=0x%x) failed\n",
pci_drv->name,
(unsigned) pci_dev->id.vendor_id,
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 2757510..34daa92 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -1675,6 +1675,7 @@ struct rte_eth_dev_data {
enum rte_kernel_driver kdrv;/**< Kernel driver passthrough */
int numa_node;  /**< NUMA node connection */
const char *drv_name;   /**< Driver name */
+   uint64_t dma_mask; /** device supported address space range */
 };

 /** Device supports hotplug detach */
-- 
1.9.1



[dpdk-dev] [PATCH 1/3] eal/linux: add function for checking hugepages within device supported address range

2016-05-12 Thread Alejandro Lucero
 - This is needed for avoiding problems with devices not being able to address
   all the physical available memory.

Signed-off-by: Alejandro Lucero 
---
 lib/librte_eal/common/include/rte_memory.h |  6 ++
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 27 +++
 2 files changed, 33 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_memory.h 
b/lib/librte_eal/common/include/rte_memory.h
index f8dbece..67b0b28 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -256,6 +256,12 @@ rte_mem_phy2mch(uint32_t memseg_id __rte_unused, const 
phys_addr_t phy_addr)
 }
 #endif

+/**
+ * Check hugepages are within the supported
+ * device address space range.
+ */
+int rte_eal_hugepage_check_address_mask(uint64_t dma_mask);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..2cd046d 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1037,6 +1037,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 }

 /*
+ * Some devices have addressing limitations. A PMD will indirectly call this
+ * function raising an error if any hugepage is out of address range supported.
+ * As hugepages are ordered by physical address, there is nothing to do as
+ * any other hugepage available will be out of range as well.
+ */
+int
+rte_eal_hugepage_check_address_mask(uint64_t dma_mask)
+{
+   const struct rte_mem_config *mcfg = 
rte_eal_get_configuration()->mem_config;
+   phys_addr_t physaddr;
+   int i =0;
+
+   while (i < RTE_MAX_MEMSEG && mcfg->memseg[i].len > 0) {
+   physaddr = mcfg->memseg[i].phys_addr + mcfg->memseg[i].len;
+   RTE_LOG(DEBUG, EAL, "Checking page with address %"PRIx64" and 
device"
+   " mask 0x%"PRIx64"\n", physaddr, dma_mask);
+   if (physaddr & ~dma_mask) {
+   RTE_LOG(ERR, EAL, "Allocated hugepages are out of 
device address"
+   " range.");
+   return -1;
+   }
+   i++;
+   }
+   return 0;
+}
+
+/*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
  *  1. map N huge pages in separate files in hugetlbfs
-- 
1.9.1



[dpdk-dev] [PATCH 0/3] add support for devices with addressing limitations

2016-05-12 Thread Alejandro Lucero
A kernel driver uses a dma mask specifying the memory address range supported
by the device for DMA operations. With DPDK there is no possibility for doing
the same thing so it could lead to problems with those devices not being able
to use all the available physical memory.

This patchset adds support for a PMD setting a device dma mask. If this dma
mask is set this will imply a call for checking hugepages allocated are within
the supported device range.

First patch adds the checking function. If there is a hugepage (memseg) out of
the device supported range an error is raised. Nothing really we can do as any
other available hugepage (and not allocated) will be also out of range as
hugepages are ordered by physical address before allocating.

Second patch adds call to the checking function if device dma mask is set during
PMD initialization. Depending on how hugepages are created and the amount of 
them
the checking could slow down initialization. If a device has not addressing 
limitations the checking is not done.

Third patch adds support for setting dma mask in the PMD NFP. Current NFP card
just supports 40 bits. Future versions will support 64 bits.

Alejandro Lucero (3):
  eal/linux: add function for checking hugepages within device supported
address range
  eth_dev: add support for device dma mask
  nfp: set device dma mask

 drivers/net/nfp/nfp_net.c  | 11 +++
 lib/librte_eal/common/include/rte_memory.h |  6 ++
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 27 +++
 lib/librte_ether/rte_ethdev.c  |  7 +++
 lib/librte_ether/rte_ethdev.h  |  1 +
 5 files changed, 52 insertions(+)

-- 
1.9.1



[dpdk-dev] [PATCH] i40e: fix link management

2016-05-12 Thread Jingjing Wu
Previously, there was a known issue "On Intel? 40G Ethernet
Controller stopping the port does not really down the port link."
There were two reasons why the port is always kept up.
1. Old version Firmware would cause issue when call "Set PHY config
command" on 40G NIC.
2. Because linux kernel i40e driver didn?t call "Set PHY config
command" when ifconfig up/down, and it assumes the link always up.
While ports are forced down when DPDK quit. So if the port is switched
to controlled by kernel driver, the port will not be up through
"ifconfig  up".

This patch fixes this issue by reopening "Set PHY config command"
because:
1. New firmware issue is already fixed.
2. After DPDK quit, "ethtool -s  autoneg on" can be used to
turn on the auto negotiation, and then port can be up through
"ifconfig  up" in new version kernel i40e driver( >1.4.X ).

Fixes: 2f1e22817420 ("i40e: skip link control as firmware workaround")
Fixes: 16c979f9adf2 ("i40e: disable setting of PHY configuration")
Signed-off-by: Jingjing Wu 
---
 doc/guides/rel_notes/known_issues.rst | 19 -
 drivers/net/i40e/i40e_ethdev.c| 72 +--
 2 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/doc/guides/rel_notes/known_issues.rst 
b/doc/guides/rel_notes/known_issues.rst
index 923a202..489bb92 100644
--- a/doc/guides/rel_notes/known_issues.rst
+++ b/doc/guides/rel_notes/known_issues.rst
@@ -532,25 +532,6 @@ Cannot set link speed on Intel? 40G Ethernet controller
Poll Mode Driver (PMD).


-Stopping the port does not down the link on Intel? 40G Ethernet controller
---
-
-**Description**:
-   On Intel? 40G Ethernet Controller stopping the port does not really down 
the port link.
-
-**Implication**:
-   The port link will be still up after stopping the port.
-
-**Resolution/Workaround**:
-   None
-
-**Affected Environment/Platform**:
-   All.
-
-**Driver/Module**:
-   Poll Mode Driver (PMD).
-
-
 Devices bound to igb_uio with VT-d enabled do not work on Linux kernel 
3.15-3.17
 


diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 24777d5..4236d07 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -1403,15 +1403,58 @@ i40e_parse_link_speeds(uint16_t link_speeds)
 }

 static int
-i40e_phy_conf_link(__rte_unused struct i40e_hw *hw,
-  __rte_unused uint8_t abilities,
-  __rte_unused uint8_t force_speed)
-{
-   /* Skip any phy config on both 10G and 40G interfaces, as a workaround
-* for the link control limitation of that all link control should be
-* handled by firmware. It should follow up if link control will be
-* opened to software driver in future firmware versions.
-*/
+i40e_phy_conf_link(struct i40e_hw *hw,
+  uint8_t abilities,
+  uint8_t force_speed)
+{
+   enum i40e_status_code status;
+   struct i40e_aq_get_phy_abilities_resp phy_ab;
+   struct i40e_aq_set_phy_config phy_conf;
+   const uint8_t mask = I40E_AQ_PHY_FLAG_PAUSE_TX |
+   I40E_AQ_PHY_FLAG_PAUSE_RX |
+   I40E_AQ_PHY_FLAG_PAUSE_RX |
+   I40E_AQ_PHY_FLAG_LOW_POWER;
+   const uint8_t advt = I40E_LINK_SPEED_40GB |
+   I40E_LINK_SPEED_10GB |
+   I40E_LINK_SPEED_1GB |
+   I40E_LINK_SPEED_100MB;
+   int ret = -ENOTSUP;
+
+
+   status = i40e_aq_get_phy_capabilities(hw, false, false, _ab,
+ NULL);
+   if (status)
+   return ret;
+
+   memset(_conf, 0, sizeof(phy_conf));
+
+   /* bits 0-2 use the values from get_phy_abilities_resp */
+   abilities &= ~mask;
+   abilities |= phy_ab.abilities & mask;
+
+   /* update ablities and speed */
+   if (abilities & I40E_AQ_PHY_AN_ENABLED)
+   phy_conf.link_speed = advt;
+   else
+   phy_conf.link_speed = force_speed;
+
+   phy_conf.abilities = abilities;
+
+   /* use get_phy_abilities_resp value for the rest */
+   phy_conf.phy_type = phy_ab.phy_type;
+   phy_conf.eee_capability = phy_ab.eee_capability;
+   phy_conf.eeer = phy_ab.eeer_val;
+   phy_conf.low_power_ctrl = phy_ab.d3_lpan;
+
+   PMD_DRV_LOG(DEBUG, "\tCurrent: abilities %x, link_speed %x",
+   phy_ab.abilities, phy_ab.link_speed);
+   PMD_DRV_LOG(DEBUG, "\tConfig:  abilities %x, link_speed %x",
+   phy_conf.abilities, phy_conf.link_speed);
+
+   status = i40e_aq_set_phy_config(hw, _conf, NULL);
+   if (status)
+   return ret;
+
return I40E_SUCCESS;
 }

@@ -1427,8 +1470,13 @@ i40e_apply_link_speed(struct rte_eth_dev *dev)
abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK;

[dpdk-dev] [PATCH v4 6/8] virtio-user: add new virtual pci driver for virtio

2016-05-12 Thread Tan, Jianfeng
Hi yuanhan,


On 5/12/2016 10:12 AM, Yuanhan Liu wrote:
> On Fri, Apr 29, 2016 at 01:18:34AM +, Jianfeng Tan wrote:
>> +static void
>> +vdev_read_dev_config(struct virtio_hw *hw, uint64_t offset,
>> + void *dst, int length)
>> +{
>> +int i;
>> +struct virtio_user_hw *uhw = (struct virtio_user_hw *)hw->vdev_private;
> Unnecessary cast.

Yes.

>
>> +static int
>> +vdev_setup_queue(struct virtio_hw *hw __rte_unused, struct virtqueue *vq)
>> +{
>> +/* Changed to use virtual addr */
>> +vq->vq_ring_mem = (phys_addr_t)vq->mz->addr;
>> +if (vq->virtio_net_hdr_mz) {
>> +vq->virtio_net_hdr_mem =
>> +(phys_addr_t)vq->virtio_net_hdr_mz->addr;
>> +/* Do it one more time after we reset virtio_net_hdr_mem */
>> +vring_hdr_desc_init(vq);
>> +}
>> +vq->offset = offsetof(struct rte_mbuf, buf_addr);
>> +return 0;
> Here as last email said, you should not mix vq stuff. What's more,
> why do you invoke vring_hdr_desc_init() here?

vring_hdr_desc_init() is to init header desc according to 
vq->virtio_net_hdr_mem, and here we change to use virtual address, so we 
need to invoke this after vq->virtio_net_hdr_mem is decided.

But for this case, you remind me that we can achieve that by: inside 
virtio_dev_queue_setup(), move vring_hdr_desc_init() after setup_queue().

> If it needs a special
> handling, do it in driver.

As discussed in previous mail with David, we should hide special 
handling inside pci ops, such as real virtio device needs to check 
address (patch 1). See below comments for more detail.

>
> The "setup_queue" method is actually for telling the device where desc,
> avail and used vring are located. Hence, the implementation could be simple:
> just log them.

>
>> +
>> +const struct virtio_pci_ops vdev_ops = {
> Note that this is the interface for the driver to talk to the device,
> we should put this file into upper layer then, in the driver.
>
> And let me make a summary, trying to make it clear:
>
> - We should not use any structures/functions from the virtio driver
>here, unless it's really a must.

Firstly I agree this point (although I see a difference in how we take 
"a must"). My original principle is to maximize the use of existing 
structures instead of maintain any new ones. And I already give up that 
principle when I accept your previous suggestion to use struct 
virtio_user_device to store virtio-user specific fields. So I agree to 
add the new struct virtqueue to avoid use of driver-layer virtqueues.

>
> - It's allowed for driver to make *few* special handling for the virtio
>user device. And that's what the driver supposed to do: to handle
>different device variants.

So here are two contradictory ways. Compared to the way you suggest,  
another way is to keep a unified driver and maintain all special 
handling inside struct virtio_pci_ops.

I prefer the latter because:
(1) Special handling for each kind of device will be gather together 
instead of scattered everywhere of driver code.
(2) It's more aligned to previous logic to hide the detail to 
differentiate modern/legacy device.


Thanks,
Jianfeng


>
>So, I think it's okay to export the virtio_user_device struct to
>driver and do all those kind of "fake pci" configration there.
>
>   --yliu
>
>> +.read_dev_cfg   = vdev_read_dev_config,
>> +.write_dev_cfg  = vdev_write_dev_config,
>> +.reset  = vdev_reset,
>> +.get_status = vdev_get_status,
>> +.set_status = vdev_set_status,
>> +.get_features   = vdev_get_features,
>> +.set_features   = vdev_set_features,
>> +.get_isr= vdev_get_isr,
>> +.set_config_irq = vdev_set_config_irq,
>> +.get_queue_num  = vdev_get_queue_num,
>> +.setup_queue= vdev_setup_queue,
>> +.del_queue  = vdev_del_queue,
>> +.notify_queue   = vdev_notify_queue,
>> +};
>> -- 
>> 2.1.4



[dpdk-dev] [PATCH] rte mempool: division or modulo by zero

2016-05-12 Thread Slawomir Mrozowicz
Fix issue reported by Coverity.

Coverity ID 13243: Division or modulo by zero
In function call rte_mempool_xmem_size, division by expression total_size
which may be zero has undefined behavior.

Fixes: 148f963fb532 ("xen: core library changes")

Signed-off-by: Slawomir Mrozowicz 
---
 lib/librte_mempool/rte_mempool.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index f8781e1..01668c1 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -327,15 +327,19 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t 
flags,
 size_t
 rte_mempool_xmem_size(uint32_t elt_num, size_t elt_sz, uint32_t pg_shift)
 {
-   size_t n, pg_num, pg_sz, sz;
+   size_t n, pg_num, pg_sz;
+   size_t sz = 0;

-   pg_sz = (size_t)1 << pg_shift;
+   if (elt_sz > 0) {
+   pg_sz = (size_t)1 << pg_shift;
+   n = pg_sz / elt_sz;

-   if ((n = pg_sz / elt_sz) > 0) {
-   pg_num = (elt_num + n - 1) / n;
-   sz = pg_num << pg_shift;
-   } else {
-   sz = RTE_ALIGN_CEIL(elt_sz, pg_sz) * elt_num;
+   if (n > 0) {
+   pg_num = (elt_num + n - 1) / n;
+   sz = pg_num << pg_shift;
+   } else {
+   sz = RTE_ALIGN_CEIL(elt_sz, pg_sz) * elt_num;
+   }
}

return sz;
-- 
1.9.1



[dpdk-dev] mbuff rearm_data aligmenet issue on non x86

2016-05-12 Thread Jerin Jacob
Hi All,

I would like align mbuff rearm_data field to 8 byte aligned so that
write to mbuf->rearm_data with uint64_t* will be naturally aligned.
I am not sure about IA but some other architecture/implementation has overhead
in non-naturally aligned stores.

Proposed patch is something like this below, But open for any change to
make fit for all other architectures/platform.

Any thoughts ?

? [master] [dpdk-master] $ git diff
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 529debb..5a917d0 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -733,10 +733,8 @@ struct rte_mbuf {
void *buf_addr;   /**< Virtual address of segment
buffer. */
phys_addr_t buf_physaddr; /**< Physical address of segment
buffer. */

-   uint16_t buf_len; /**< Length of segment buffer. */
-
/* next 6 bytes are initialised on RX descriptor rearm */
-   MARKER8 rearm_data;
+   MARKER64 rearm_data;
uint16_t data_off;

/**
@@ -754,6 +752,7 @@ struct rte_mbuf {
uint8_t nb_segs;  /**< Number of segments. */
uint8_t port; /**< Input port. */

+   uint16_t buf_len; /**< Length of segment buffer. */
uint64_t ol_flags;/**< Offload features. */

/* remaining bytes are set on RX when pulling packet from
 * descriptor 

/Jerin


[dpdk-dev] [PATCHv3 1/2] config/armv8a: disable igb_uio

2016-05-12 Thread Santosh Shukla
On Thu, May 12, 2016 at 01:54:13PM +0800, Jianbo Liu wrote:
> On 12 May 2016 at 13:06, Santosh Shukla
>  wrote:
> > On Thu, May 12, 2016 at 11:42:26AM +0800, Jianbo Liu wrote:
> >> On 12 May 2016 at 11:17, Santosh Shukla
> >>  wrote:
> >> > On Thu, May 12, 2016 at 10:01:05AM +0800, Jianbo Liu wrote:
> >> >> On 12 May 2016 at 02:25, Stephen Hemminger  >> >> networkplumber.org> wrote:
> >> >> > On Wed, 11 May 2016 22:32:16 +0530
> >> >> > Jerin Jacob  wrote:
> >> >> >
> >> >> >> On Wed, May 11, 2016 at 08:22:59AM -0700, Stephen Hemminger wrote:
> >> >> >> > On Wed, 11 May 2016 19:17:58 +0530
> >> >> >> > Hemant Agrawal  wrote:
> >> >> >> >
> >> >> >> > > IGB_UIO not supported for arm64 arch in kernel so disable.
> >> >> >> > >
> >> >> >> > > Signed-off-by: Hemant Agrawal 
> >> >> >> > > Reviewed-by: Santosh Shukla  >> >> >> > > caviumnetworks.com>
> >> >> >> >
> >> >> >> > Really, I have use IGB_UIO on ARM64
> >> >> >>
> >> >> >> May I know what is the technical use case for igb_uio on arm64
> >> >> >> which cannot be addressed through vfio or vfioionommu.
> >> >> >
> >> >> > I was running on older kernel which did not support vfioionommu mode.
> >> >>
> >> >> As I said, most of DPDK developers are not kernel developers. They may
> >> >> have their own kernel tree, and couldn't like to upgrade to latest
> >> >> kernel.
> >> >> They can choose to use or not use igb_uio when binding the driver. But
> >> >> blindly disabling it in the base config seems unreasonable.
> >> >
> >> > if user keeping his own kernel so they could also keep IGB_UIO=y in 
> >> > their local
> >> Most likely they don't have local dpdk tree. They write their own
> >> applications, complie and link to dpdk lib, then done.
> >>
> >> > dpdk tree. Why are you imposing user-x custome depedancy on upstream 
> >> > dpdk base
> >> Customer requiremnts is important. I want they can choose the way they 
> >> like.
> >>
> >
> > so you choose to keep igb_uio option, provided arch doesn't support?
> > new user did reported issues with igb_uio for arm64, refer this thread [1], 
> > as
> > well hemanth too faced issues. we want to avoid that.
> >
> > If customer maintaing out-of-tree kernel then he can also switch to 
> > vfio-way.
> > isn;t it?
> >
> >> > config. Is it not enough for explanation that - Base config ie.. armv8 
> >> > doesn;t
> >> > support pci mmap, so igb_uio is n/a. New user wont able to build/run 
> >> > dpdk/arm64
> >> > in igb_uio-way, He'll prefer to use upstream stuff. I think, you are not 
> >> > making
> >> You are wrong, he can build dpdk. If he like to use upstream without
> >> patching, he can use vfio.
> >
> > I disagree, we want to avoid [1] for new user.
> >
> >> But you can't ignore the need from old user which is more comfortable
> >> with older kernel.
> >>
> > arm/arm64 dpdk support recently added and I am guessing, most likely 
> > customer
> > using near latest kernel, switching to vfio won't be so difficult.
> >
> > Or can you take up responsibility of upstreaming pci mmap patch, then we 
> > don't
> > need this patch.
> >
> > [1] http://dpdk.org/ml/archives/dev/2016-January/031313.html
> 
> Can you read carefully about the guide at
> http://dpdk.org/doc/guides/linux_gsg/build_dpdk.html? It says to use
> uio_pci_generic, igb_uio or vfio-pci.

*** applicable and works for x86 only, not for arm64: because pci mmap support
not present for arm64, in that case we should update the doc.

> Could it be possible that the user in that thread has already read and
> tried them all and found that he can't enable vifo with his kernel,
> and igb_uio is the easy way for him and asked for help from community?
> If so, we have no choice but keeping igb_uio enabled.

By then vfionoiommu support was wip progress in dpdk/linux. but now it merged
and it works. So no need to retain igb_uio in base config for which to work -
user need to use mmap patch at linux side.

Or can you maintain out-of-tree pci mmap patch/ kerne source and make it
explicit somewhere in dpdk build doc that - if user want igb_uio way then
use kernel/mmap patch from x location.

> He use lsmod to show us the modules, most likely he know vifo-pci.
> 
> Below are the details on modules, hugepages and device binding.
> root at arm64:~# lsmod
> Module  Size  Used by
> rte_kni   292795  0
> igb_uio 4338  0
> ixgbe 184456  0


[dpdk-dev] [PATCHv3 1/2] config/armv8a: disable igb_uio

2016-05-12 Thread Jianbo Liu
On 12 May 2016 at 13:06, Santosh Shukla
 wrote:
> On Thu, May 12, 2016 at 11:42:26AM +0800, Jianbo Liu wrote:
>> On 12 May 2016 at 11:17, Santosh Shukla
>>  wrote:
>> > On Thu, May 12, 2016 at 10:01:05AM +0800, Jianbo Liu wrote:
>> >> On 12 May 2016 at 02:25, Stephen Hemminger > >> networkplumber.org> wrote:
>> >> > On Wed, 11 May 2016 22:32:16 +0530
>> >> > Jerin Jacob  wrote:
>> >> >
>> >> >> On Wed, May 11, 2016 at 08:22:59AM -0700, Stephen Hemminger wrote:
>> >> >> > On Wed, 11 May 2016 19:17:58 +0530
>> >> >> > Hemant Agrawal  wrote:
>> >> >> >
>> >> >> > > IGB_UIO not supported for arm64 arch in kernel so disable.
>> >> >> > >
>> >> >> > > Signed-off-by: Hemant Agrawal 
>> >> >> > > Reviewed-by: Santosh Shukla 
>> >> >> >
>> >> >> > Really, I have use IGB_UIO on ARM64
>> >> >>
>> >> >> May I know what is the technical use case for igb_uio on arm64
>> >> >> which cannot be addressed through vfio or vfioionommu.
>> >> >
>> >> > I was running on older kernel which did not support vfioionommu mode.
>> >>
>> >> As I said, most of DPDK developers are not kernel developers. They may
>> >> have their own kernel tree, and couldn't like to upgrade to latest
>> >> kernel.
>> >> They can choose to use or not use igb_uio when binding the driver. But
>> >> blindly disabling it in the base config seems unreasonable.
>> >
>> > if user keeping his own kernel so they could also keep IGB_UIO=y in their 
>> > local
>> Most likely they don't have local dpdk tree. They write their own
>> applications, complie and link to dpdk lib, then done.
>>
>> > dpdk tree. Why are you imposing user-x custome depedancy on upstream dpdk 
>> > base
>> Customer requiremnts is important. I want they can choose the way they like.
>>
>
> so you choose to keep igb_uio option, provided arch doesn't support?
> new user did reported issues with igb_uio for arm64, refer this thread [1], as
> well hemanth too faced issues. we want to avoid that.
>
> If customer maintaing out-of-tree kernel then he can also switch to vfio-way.
> isn;t it?
>
>> > config. Is it not enough for explanation that - Base config ie.. armv8 
>> > doesn;t
>> > support pci mmap, so igb_uio is n/a. New user wont able to build/run 
>> > dpdk/arm64
>> > in igb_uio-way, He'll prefer to use upstream stuff. I think, you are not 
>> > making
>> You are wrong, he can build dpdk. If he like to use upstream without
>> patching, he can use vfio.
>
> I disagree, we want to avoid [1] for new user.
>
>> But you can't ignore the need from old user which is more comfortable
>> with older kernel.
>>
> arm/arm64 dpdk support recently added and I am guessing, most likely customer
> using near latest kernel, switching to vfio won't be so difficult.
>
> Or can you take up responsibility of upstreaming pci mmap patch, then we don't
> need this patch.
>
> [1] http://dpdk.org/ml/archives/dev/2016-January/031313.html

Can you read carefully about the guide at
http://dpdk.org/doc/guides/linux_gsg/build_dpdk.html? It says to use
uio_pci_generic, igb_uio or vfio-pci.
Could it be possible that the user in that thread has already read and
tried them all and found that he can't enable vifo with his kernel,
and igb_uio is the easy way for him and asked for help from community?
If so, we have no choice but keeping igb_uio enabled.

He use lsmod to show us the modules, most likely he know vifo-pci.

Below are the details on modules, hugepages and device binding.
root at arm64:~# lsmod
Module  Size  Used by
rte_kni   292795  0
igb_uio 4338  0
ixgbe 184456  0


[dpdk-dev] mbuff rearm_data aligmenet issue on non x86

2016-05-12 Thread Ananyev, Konstantin
> 
> On Thu, May 12, 2016 at 10:07:09AM +, Ananyev, Konstantin wrote:
> > Hi Jerrin,
> >
> > >
> > > Hi All,
> > >
> > > I would like align mbuff rearm_data field to 8 byte aligned so that
> > > write to mbuf->rearm_data with uint64_t* will be naturally aligned.
> > > I am not sure about IA but some other architecture/implementation has 
> > > overhead
> > > in non-naturally aligned stores.
> > >
> > > Proposed patch is something like this below, But open for any change to
> > > make fit for all other architectures/platform.
> > >
> > > Any thoughts ?
> > >
> > > ? [master] [dpdk-master] $ git diff
> > > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> > > index 529debb..5a917d0 100644
> > > --- a/lib/librte_mbuf/rte_mbuf.h
> > > +++ b/lib/librte_mbuf/rte_mbuf.h
> > > @@ -733,10 +733,8 @@ struct rte_mbuf {
> > > void *buf_addr;   /**< Virtual address of segment
> > > buffer. */
> > > phys_addr_t buf_physaddr; /**< Physical address of segment
> > > buffer. */
> > >
> > > -   uint16_t buf_len; /**< Length of segment buffer. */
> > > -
> >
> >
> > There is no need to move buf_len itself, I think.
> > Just move rearm_data marker prior to buf_len is enough.
> > Though how do you suggest to deal with the fact, that right now we blindly
> > update the whole 64bits pointed by rearm_data:
> >
> > drivers/net/ixgbe/ixgbe_rxtx_vec.c:
> > /*
> >  * Flush mbuf with pkt template.
> >  * Data to be rearmed is 6 bytes long.
> >  * Though, RX will overwrite ol_flags that are coming next
> >  * anyway. So overwrite whole 8 bytes with one load:
> >  * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
> >  */
> > p0 = (uintptr_t)>rearm_data;
> > *(uint64_t *)p0 = rxq->mbuf_initializer;
> >
> > ?
> >
> > If buf_len will be inside these 64bits, we can't do it anymore.
> >
> > Are you suggesting something like:
> >
> > uint64_t *p0, v0;
> >
> > p0 = >rearm_data;
> > v0 = *p0 & REARM_MASK;
> > *p0 = v0 | rxq->mbuf_initializer;
> > ?
> 
> Due to unaligned rearm_data issue, In ThunderX platform, we need to write
> multiple half word of aligned stores(so masking was better us).

Ok, so what would be the gain on ARM if you'll make that change?
Again, what would be the drop (if any) on IA?

> But I think, if we can put 16bit hole between port and ol_flags then
> we may not need the masking stuff in ixgbe. Right?

You mean move buf_len somewhere else (end of cacheline0) and 
introduce a 2B hole between port and ol_flags, right?
Yep, that probably wouldn't have any performance impact.

> 
> OR
> 
> Even better, if we can fill in a uint16_t variable which will replaced
> later in the flow like "data_len"?

data_len is grouped  with rx_descriptor_fields1 on purpose -
so we can update packet_type,pkt_len, data_len,vlan_tci,rss with one 16B write.

Konstantin

> and move buf_len at end the first  cache line? 
>or any other thoughts to fix unaligned rearm_data issue?
> 
> Jerin
> 
> 
> 
> >
> > If so I wonder what would be the performance impact of that change.
> > Konstantin
> >
> >
> > > /* next 6 bytes are initialised on RX descriptor rearm */
> > > -   MARKER8 rearm_data;
> > > +   MARKER64 rearm_data;
> > > uint16_t data_off;
> > >
> > > /**
> > > @@ -754,6 +752,7 @@ struct rte_mbuf {
> > > uint8_t nb_segs;  /**< Number of segments. */
> > > uint8_t port; /**< Input port. */
> > >
> > > +   uint16_t buf_len; /**< Length of segment buffer. */
> > > uint64_t ol_flags;/**< Offload features. */
> > >
> > > /* remaining bytes are set on RX when pulling packet from
> > >  * descriptor
> > >
> > > /Jerin


[dpdk-dev] virtio pmd failed in pci probing

2016-05-12 Thread Vincent Li
I add a debug log line in dpdk-16.04/lib/librte_ether/rte_ethdev.c
rte_eth_driver_register

void
rte_eth_driver_register(struct eth_driver *eth_drv)
{
eth_drv->pci_drv.devinit = rte_eth_dev_init;
eth_drv->pci_drv.devuninit = rte_eth_dev_uninit;
rte_eal_pci_register(_drv->pci_drv);
RTE_LOG(DEBUG, EAL, "  register pmd driver %s\n",
eth_drv->pci_drv.name);
}

then run the mTCP app, rte_virtio_pmd is missing:

EAL: WARNING: cpu flags constant_tsc=yes nonstop_tsc=no -> using
unreliable clock cycles !
EAL: Master lcore 0 is ready (tid=6ec31900;cpuset=[0])
EAL:   register pmd driver rte_vmxnet3_pmd
EAL:   register pmd driver rte_i40e_pmd
EAL:   register pmd driver rte_i40evf_pmd
EAL:   register pmd driver rte_ixgbe_pmd
EAL:   register pmd driver rte_ixgbevf_pmd
EAL:   register pmd driver rte_igb_pmd
EAL:   register pmd driver rte_igbvf_pmd
EAL:   register pmd driver rte_em_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_vmxnet3_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_i40e_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_i40evf_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_ixgbe_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_ixgbevf_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_igb_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_igbvf_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_em_pmd
pci_probe_all_drivers ret: 1 devargs not NULL and whitelisted
EAL: Error - exiting with code: 1
  Cause: No Ethernet port!

any clue?


On Thu, May 12, 2016 at 9:34 AM, Vincent Li  wrote:
> Hi
>
> I am testing mTCP https://github.com/eunyoung14/mtcp  with dpdk-16.04
> on KVM guest and it appears the virtio pmd fail to load, detail info
> below:
>
>
> # ./tools/dpdk_nic_bind.py --status
>
> Network devices using DPDK-compatible driver
> 
> :00:07.0 'Virtio network device' drv=igb_uio unused=
>
> Network devices using kernel driver
> ===
> :00:03.0 'Virtio network device' if= drv=virtio-pci unused=igb_uio
> :00:08.0 'Virtio network device' if= drv=virtio-pci unused=igb_uio
>
> in mtcp/src/io_module.c, I have (I hard code the whiltelist to make it
> easy to test for me):
>
>
>
>sprintf(whitelist, "%s", ":00:07.0");
>/* initialize the rte env first, what a waste of
> implementation effort!  */
> char *argv[] = {"",
> "-c",
> cpumaskbuf,
> "-w",
> whitelist,
> "-n",
> mem_channels,
> "--proc-type=auto",
> ""
> };
> const int argc = 8;
>
> /*
>  * re-set getopt extern variable optind.
>  * this issue was a bitch to debug
>  * rte_eal_init() internally uses getopt() syscall
>  * mtcp applications that also use an `external' getopt
>  * will cause a violent crash if optind is not reset to zero
>  * prior to calling the func below...
>  * see man getopt(3) for more details
>  */
> optind = 0;
>
> /* initialize the dpdk eal env */
> ret = rte_eal_init(argc, argv);
> if (ret < 0)
> rte_exit(EXIT_FAILURE, "Invalid EAL args!\n");
> /* give me the count of 'detected' ethernet ports */
> num_devices = rte_eth_dev_count();
> if (num_devices == 0) {
> rte_exit(EXIT_FAILURE, "No Ethernet port!\n");
> }
>
> in dpdk-16.04/lib/librte_eal/common/eal_common_pci.c
> rte_eal_pci_probe_one_driver, I added debug line below:
>
> /*
>  * If vendor/device ID match, call the devinit() function of the
>  * driver.
>  */
> static int
> rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, struct
> rte_pci_device *dev)
> {
> int ret;
> const struct rte_pci_id *id_table;
>
> RTE_LOG(DEBUG, EAL, "  dev->id.vendor_id:dev->id.device_id
> dr->name %x:%x %s\n", dev->id.vendor_id,
> dev->id.device_id, dr->name);
>
>
> when I run mTCP app as below, it says "No Ethernet port!", please note
> that the debug line did not show rte_virtio_pmd, why? is it because
> virtio pmd lack of vendor_id/device_id implementation?
>
>
> EAL: WARNING: cpu flags constant_tsc=yes nonstop_tsc=no -> using
> unreliable clock cycles !
> EAL: Master lcore 0 is ready (tid=dca0e900;cpuset=[0])
> EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_vmxnet3_pmd
> EAL:   

[dpdk-dev] [PATCHv3 1/2] config/armv8a: disable igb_uio

2016-05-12 Thread Jianbo Liu
On 12 May 2016 at 11:17, Santosh Shukla
 wrote:
> On Thu, May 12, 2016 at 10:01:05AM +0800, Jianbo Liu wrote:
>> On 12 May 2016 at 02:25, Stephen Hemminger  
>> wrote:
>> > On Wed, 11 May 2016 22:32:16 +0530
>> > Jerin Jacob  wrote:
>> >
>> >> On Wed, May 11, 2016 at 08:22:59AM -0700, Stephen Hemminger wrote:
>> >> > On Wed, 11 May 2016 19:17:58 +0530
>> >> > Hemant Agrawal  wrote:
>> >> >
>> >> > > IGB_UIO not supported for arm64 arch in kernel so disable.
>> >> > >
>> >> > > Signed-off-by: Hemant Agrawal 
>> >> > > Reviewed-by: Santosh Shukla 
>> >> >
>> >> > Really, I have use IGB_UIO on ARM64
>> >>
>> >> May I know what is the technical use case for igb_uio on arm64
>> >> which cannot be addressed through vfio or vfioionommu.
>> >
>> > I was running on older kernel which did not support vfioionommu mode.
>>
>> As I said, most of DPDK developers are not kernel developers. They may
>> have their own kernel tree, and couldn't like to upgrade to latest
>> kernel.
>> They can choose to use or not use igb_uio when binding the driver. But
>> blindly disabling it in the base config seems unreasonable.
>
> if user keeping his own kernel so they could also keep IGB_UIO=y in their 
> local
Most likely they don't have local dpdk tree. They write their own
applications, complie and link to dpdk lib, then done.

> dpdk tree. Why are you imposing user-x custome depedancy on upstream dpdk base
Customer requiremnts is important. I want they can choose the way they like.

> config. Is it not enough for explanation that - Base config ie.. armv8 doesn;t
> support pci mmap, so igb_uio is n/a. New user wont able to build/run 
> dpdk/arm64
> in igb_uio-way, He'll prefer to use upstream stuff. I think, you are not 
> making
You are wrong, he can build dpdk. If he like to use upstream without
patching, he can use vfio.
But you can't ignore the need from old user which is more comfortable
with older kernel.

> sense.
>


[dpdk-dev] releases scheduling

2016-05-12 Thread Thomas Monjalon
Hi everybody,

The deadlines for the next releases are available on the website:
http://dpdk.org/dev/roadmap#dates

Release 16.07
Proposal deadline: May 8
Integration deadline: June 16
Release: July 18
Release 16.11
Proposal deadline: August 28
Integration deadline: September 30
Release: November 2
Release 17.02
Release: February 1
Release 17.05
Release: May 2
Release 17.08
Release: August 1
Release 17.11
Release: November 2

The planning try to preserve some of the major holiday periods
(February, May, August, December).

Let's put more details for the next year:

Release 17.02
Proposal deadline: December 4
Integration deadline: January 5
Release: February 1
Release 17.05
Proposal deadline: February 26
Integration deadline: March 30
Release: May 2
Release 17.08
Proposal deadline: May 28
Integration deadline: June 29
Release: August 1

As usual, comments are welcome


[dpdk-dev] [RFC] mbuf: remove unused rx error flags

2016-05-12 Thread Olivier MATZ
Hi,

On 05/12/2016 03:32 AM, John Daley (johndale) wrote:
>> This patch removes the unused flags from rte_mbuf, and their use in
>> the drivers. The enic driver is modified to drop bad packets, but
>> i40e and fm10k are kept as they are today and should be fixed to
>> drop bad packets.
>
> Perhaps the change to enic to drop bad packets should be handled as a
> separate patch since it's not strictly related to not removing the
> use of the flags?

Yes, it's probably better to have it in a separate patch.

>> John, I did not test the patch on the enic driver, so please review
>> it carefully.
>>
>
> The patch works for enic, thanks. There are a few minor changes for
> performance: - put an unlikely in the if (packet_error) conditional -
> remove the if (!packet_error) conditional since it will always be
> true. Let me know if you would prefer I submit the enic patch
> separately.

I'll do it, thanks for reviewing.
I'll wait a bit for other comments before sending a first version of
the patchset.

Regards,
Olivier


[dpdk-dev] [PATCH] eal: fix log level/type retrieving on a standard pthread

2016-05-12 Thread Olivier MATZ
Hi Ferruh,

On 05/11/2016 06:39 PM, Ferruh Yigit wrote:
> On 5/9/2016 5:13 PM, Olivier Matz wrote:
>> --- a/lib/librte_eal/common/eal_common_log.c
>> +++ b/lib/librte_eal/common/eal_common_log.c
>> @@ -98,9 +98,10 @@ static int history_enabled = 1;
>>   struct log_cur_msg {
>>  uint32_t loglevel; /**< log level - see rte_log.h */
>>  uint32_t logtype;  /**< log type  - see rte_log.h */
>> -} __rte_cache_aligned;
>
> Removing alignment seems not related the main purpose of the patch. Is
> this intentional?

Initially, the structure was cache-aligned so each element of
the table was stored in a separate cache line, avoiding a lcore
accessing its element to polute its neighbors (this was by the
way a bit overkill as it's not a performance-sensitive structure).

Using a __thread variable instead of a table naturally removes this
need because it will be stored in a specific section containing only
per-core data.

> I have tested with custom code, non-EAL thread have lcore_id value
> UINT32_MAX, which is > RTE_MAX_LCORE and rte_log_cur_msg_loglevel gives
> default log level as described in commit log. With this patch each
> thread gets its own log level.
>
> Reviewed-by: Ferruh Yigit 
>

Thanks for reviewing and testing.

Regards,
Olivier


[dpdk-dev] [PATCH] lpm: unchecked return value

2016-05-12 Thread Azarewicz, PiotrX T
Hi,

I handle Coverity defect ID 13201. It is about unchecked return value from 
rte_lpm6_delete() instances in rte_lpm6_add() function.
Next I found this thread and I see that both defects (ID 13205 and ID 13201) 
may be resolved all together.

> >> Fix issue reported by Coverity.
> >>
> >> Coverity ID 13205: Unchecked return value Unchecked return value
> >> check_return: Calling rte_lpm6_add without checking return value
> >> Fixes: 5c510e13a9cb ("lpm: add IPv6 support")
> >>
> >> Signed-off-by: Slawomir Mrozowicz 
> >> ---
> >>  lib/librte_lpm/rte_lpm6.c | 10 ++
> >>  1 file changed, 6 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/lib/librte_lpm/rte_lpm6.c b/lib/librte_lpm/rte_lpm6.c
> >> index ba4353c..f4db3fa 100644
> >> --- a/lib/librte_lpm/rte_lpm6.c
> >> +++ b/lib/librte_lpm/rte_lpm6.c
> >> @@ -749,6 +749,7 @@ rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t
> >> *ip,
> >uint8_t depth)
> >>int32_t rule_to_delete_index;
> >>uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE];
> >>unsigned i;
> >> +  int status = 0;
> >>
> >>/*
> >> * Check input arguments.
> >> @@ -790,12 +791,13 @@ rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t
> >*ip, uint8_t depth)
> >> * Add every rule again (except for the one that was removed from
> >> * the rules table).
> >> */
> >> -  for (i = 0; i < lpm->used_rules; i++) {
> >> -  rte_lpm6_add(lpm, lpm->rules_tbl[i].ip, lpm-
> >>rules_tbl[i].depth,
> >> -  lpm->rules_tbl[i].next_hop);
> >> +  for (i = 0; i < lpm->used_rules && status >= 0; i++) {
> >> +  status = rte_lpm6_add(
> >> +  lpm, lpm->rules_tbl[i].ip, lpm->rules_tbl[i].depth,
> >> +  lpm->rules_tbl[i].next_hop);
> >>}
> >>
> >> -  return 0;
> >> +  return status;
> >>  }
> >
> >Hi,
> >
> >I'm not sure that this patch is actually necessary, as I'm not sure
> >that the lpm6_add calls can fail in this instance. Looking through the
> >code, this function deletes the rule and then clears the actual lpm
> >lookup tables before re-adding all other routes to it again. The only
> >error condition that could be returned, that I can see, is -ENOSPC,
> >which should never occur here since the original rules fitted in the first
> place.

I agree that -ENOSPC should never occur here. So rte_lpm6_add() instance should 
never fail here.

Next I looked at rte_lpm6_add() and if rte_lpm6_delete() instances in it may 
fail?
The only suspicious place that I found is place when add every rule again but 
that should work as discussed above.

> >
> >If it was possible to fail, then I think we would have a worse problem,
> >in that deleting a single rule has wiped out our lpm table and left it
> >in an inconsistent state, so the error handling probably needs to be better
> than just quitting.
> >
> >Finally, one other thing I spot looking through the code, is that there
> >seems to be a worrying set of calls between add and delete. If the add
> >function fails, then it calls delete which in turn will call add again,
> >etc. etc. This may all work correctly, but it seems fragile and error
> >prone to me - especially if we allow calls from one to another to fail.
> >
> >This looks like it might need some further examination to verify what
> >the possible failure cases are and what happens in each scenario.

I see no failure scenarios in here. I mean I see no possibility to create test 
that show that add function fail in del and opposite.
The only scenario what I have in my mind is that someone call add or/and del 
functions on different threads with the same lpm table instance, but this is 
not allowed, cause we know that this functions are not thread safe.

> >
> >Regards,
> >/Bruce
> 
> 
> Hi Bruce,
> 
> In my opinion the worst-case scenario should be take into account. If
> function like rte_lpm6_add() returns false then it should be handled.
> 
> Anyway I agree with you that if the function fail then we have serious
> problem.
> I see two problems:
> 1. Code construction: calls between function rte_lpm6_add() and
> rte_lpm6_delete(). As you said it should be examined.
> 2. How we should handle situation if the rules table are not reconstructed
> after delete operation.
> 
> I propose to add new issue in ClearQuest to proceed solve the problems
> because there are extend the original issue (CID 13205 Unchecked return
> value) from Coverity.
> 
> Regards,
> S?awomir

I propose to classify this Coverity issues (ID 13205 and ID 13201) as 
Intentional.

Regards,
Piotr


[dpdk-dev] [PATCH v2] eal/linuxapp: fix resource leak

2016-05-12 Thread Daniel Mrzyglod
Fix issue reported by Coverity.
Coverity ID 97920

munmap structure of hugepage

leaked_storage: Variable hugepage going out of scope leaks the storage
it points to.

The system resource will not be reclaimed and reused, reducing the future
availability of the resource.

In rte_eal_hugepage_init: Leak of memory or pointers to system resources

Fixes: b6a468ad41d5 ("memory: add --socket-mem option")

Signed-off-by: Daniel Mrzyglod 
---
 lib/librte_eal/linuxapp/eal/eal_memory.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..9fd0d8d 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1051,7 +1051,7 @@ int
 rte_eal_hugepage_init(void)
 {
struct rte_mem_config *mcfg;
-   struct hugepage_file *hugepage, *tmp_hp = NULL;
+   struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];

uint64_t memory[RTE_MAX_NUMA_NODES];
@@ -1367,13 +1367,15 @@ rte_eal_hugepage_init(void)
"of memory.\n",
i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
RTE_MAX_MEMSEG);
-   return -ENOMEM;
+   goto fail;
}

return 0;

 fail:
free(tmp_hp);
+   if (hugepage != NULL)
+   munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
return -1;
 }

-- 
2.5.5



[dpdk-dev] DPDK Community Call - 16.04 Retrospective - Wednesday May 11th

2016-05-12 Thread Glynn, Michael J
Meeting Minutes for Community Call - May 11th 2014

Topic: DPDK 16.04 Retrospective 
Facilitators: Mike Glynn (Intel), John McNamara (Intel)

Attendees: Christian Ehrhardt, Hemant Agrawal, Jan Viktorin, Mauricio Vasquez, 
Mike Holmes, Stephen Hemminger, Thomas Monjalon, Naoyuki Mori, Tom Gall, 
Konstantin Ananyev, Mike Glynn, John McNamara, Mohammad Abdul Awal, Bruce 
Richardson, Declan Doherty, Roy Fan Zhan, Ferruh Yigit, Bernard Iremonger, 
Reshma Pattan, Remy Horton, Vadim Sukhomlinov, Shanmukha Sreedhar Theerthala

Minutes:
*   IRC chat channel was opened up the discussion (see notes captured 
below). IRC Channel: http://webchat.freenode.net/?channels=%23dpdk
*   John McNamara reviewed some statistics from 16.04, which included;
o   Number of commits per release - averaging ~700
o   Number of contributors per release - ~100 for the past two releases
o   Unique "Reviewed-by" contributors is very low (7 in least release) 
o   Number of "fix" patches - 301 
o   Number of patches with a "Fixes:" line - has grown from 45 in DPDK2.0 
to 261 in 16.04 (attributed to greater compliance to using the "fixes" line)
o   Number of patchset revisions in v16.04 - vast majority at v1 (199) 
o   Days between Author Date and Commit Date - 50% of final patches are 
merged within 7 days, 75% of final patches are merged within 14 days, "Final" 
patch means the last version, when all v1..vN comments have been addressed

*   Discussion on Areas for Improvement:
o   John Mc: 'Reviewed by' numbers are low. Thomas: this is likely because 
people don't distinguish between 'reviewed by' and 'acked'. Agreed that there 
is still value in retaining the reviewed-by tag.
o   Stephen H: Need more active reviews in the community. Suggested that 
the Maintainers could delegate to others on the mailing list. Maintainers - 
please take note.
o   Thomas M: Having a large number of v1's being applied may not be a good 
thing...would prefer to see higher number of patch revisions since it shows 
community review process is working better. Reviews are the area which require 
the most improvement. 
o   Christian E: Packaging of DPDK - any community pushback to package 
something working almost everywhere but optimized where supported. No 
objections - Christian will post to the mailing list next week. 
http://udrepper.livejournal.com/20948.html 
o   Documentation gaps in (1) RTE table library, and (2) ACL need to be 
addressed. Please discussion on the mailing list, submit patches, or ping John 
McNamara (documentation maintainer)
o   Mike G: Is the RFC process working? There is usually very little 
feedback on RFC's. Thomas commented that RFC's are still useful, suggested 
using header file (with Doxygen comments) as an RFC format.

IRC Chat Notes:
[16:09] == reshmapa [c0c6972b at gateway/web/freenode/ip.192.198.151.43] has 
joined #DPDK
[16:16]  Not enough public reviews
[16:17]  Fixes: lines are increasing (good for maintenance)
[16:20]  A lot of patches are committed late in the cycle
[16:22]  just for the minutes - since the call mentioned some missing 
commit stats - If I didn't mistype that should be 2.2 -> 16.04 - Authors: 
http://paste.ubuntu.com/16363502/   Domains: http://paste.ubuntu.com/16363543/ 
[16:29] == nijopa [~nijopa at 72.246.0.14] has quit [Quit: Leaving.]
[16:33] == nijopa [~nijopa at 72.246.0.14] has joined #DPDK
[16:35]  not for minutes: audio quality is bad from some speakers - 
really hard to understand
[16:36] == yliu [yliu131 at nat/intel/x-phwergujuqevebif] has joined #DPDK
[16:39]  I wanted at least see how the idea at all reflects with the 
community - http://udrepper.livejournal.com/20948.html 
[16:39]  that would allow Distributions to package something working 
almost everywhere but optimized where supported
[16:39]  there were no hard opinions on it yet, I'll bring something 
to the mailing list (prob. next week)

Please feel free to reply if I missed something/captured incorrectly


[dpdk-dev] [PATCHv3 1/2] config/armv8a: disable igb_uio

2016-05-12 Thread Santosh Shukla
On Thu, May 12, 2016 at 11:42:26AM +0800, Jianbo Liu wrote:
> On 12 May 2016 at 11:17, Santosh Shukla
>  wrote:
> > On Thu, May 12, 2016 at 10:01:05AM +0800, Jianbo Liu wrote:
> >> On 12 May 2016 at 02:25, Stephen Hemminger  
> >> wrote:
> >> > On Wed, 11 May 2016 22:32:16 +0530
> >> > Jerin Jacob  wrote:
> >> >
> >> >> On Wed, May 11, 2016 at 08:22:59AM -0700, Stephen Hemminger wrote:
> >> >> > On Wed, 11 May 2016 19:17:58 +0530
> >> >> > Hemant Agrawal  wrote:
> >> >> >
> >> >> > > IGB_UIO not supported for arm64 arch in kernel so disable.
> >> >> > >
> >> >> > > Signed-off-by: Hemant Agrawal 
> >> >> > > Reviewed-by: Santosh Shukla 
> >> >> >
> >> >> > Really, I have use IGB_UIO on ARM64
> >> >>
> >> >> May I know what is the technical use case for igb_uio on arm64
> >> >> which cannot be addressed through vfio or vfioionommu.
> >> >
> >> > I was running on older kernel which did not support vfioionommu mode.
> >>
> >> As I said, most of DPDK developers are not kernel developers. They may
> >> have their own kernel tree, and couldn't like to upgrade to latest
> >> kernel.
> >> They can choose to use or not use igb_uio when binding the driver. But
> >> blindly disabling it in the base config seems unreasonable.
> >
> > if user keeping his own kernel so they could also keep IGB_UIO=y in their 
> > local
> Most likely they don't have local dpdk tree. They write their own
> applications, complie and link to dpdk lib, then done.
> 
> > dpdk tree. Why are you imposing user-x custome depedancy on upstream dpdk 
> > base
> Customer requiremnts is important. I want they can choose the way they like.
>

so you choose to keep igb_uio option, provided arch doesn't support?
new user did reported issues with igb_uio for arm64, refer this thread [1], as
well hemanth too faced issues. we want to avoid that. 

If customer maintaing out-of-tree kernel then he can also switch to vfio-way.
isn;t it?

> > config. Is it not enough for explanation that - Base config ie.. armv8 
> > doesn;t
> > support pci mmap, so igb_uio is n/a. New user wont able to build/run 
> > dpdk/arm64
> > in igb_uio-way, He'll prefer to use upstream stuff. I think, you are not 
> > making
> You are wrong, he can build dpdk. If he like to use upstream without
> patching, he can use vfio.

I disagree, we want to avoid [1] for new user.

> But you can't ignore the need from old user which is more comfortable
> with older kernel.
> 
arm/arm64 dpdk support recently added and I am guessing, most likely customer
using near latest kernel, switching to vfio won't be so difficult.

Or can you take up responsibility of upstreaming pci mmap patch, then we don't
need this patch.

[1] http://dpdk.org/ml/archives/dev/2016-January/031313.html
> > sense.
> >


[dpdk-dev] mbuff rearm_data aligmenet issue on non x86

2016-05-12 Thread Ananyev, Konstantin
Hi Jerrin,

> 
> Hi All,
> 
> I would like align mbuff rearm_data field to 8 byte aligned so that
> write to mbuf->rearm_data with uint64_t* will be naturally aligned.
> I am not sure about IA but some other architecture/implementation has overhead
> in non-naturally aligned stores.
> 
> Proposed patch is something like this below, But open for any change to
> make fit for all other architectures/platform.
> 
> Any thoughts ?
> 
> ? [master] [dpdk-master] $ git diff
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 529debb..5a917d0 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -733,10 +733,8 @@ struct rte_mbuf {
> void *buf_addr;   /**< Virtual address of segment
> buffer. */
> phys_addr_t buf_physaddr; /**< Physical address of segment
> buffer. */
> 
> -   uint16_t buf_len; /**< Length of segment buffer. */
> -


There is no need to move buf_len itself, I think.
Just move rearm_data marker prior to buf_len is enough.
Though how do you suggest to deal with the fact, that right now we blindly
update the whole 64bits pointed by rearm_data:

drivers/net/ixgbe/ixgbe_rxtx_vec.c:
/*
 * Flush mbuf with pkt template.
 * Data to be rearmed is 6 bytes long.
 * Though, RX will overwrite ol_flags that are coming next
 * anyway. So overwrite whole 8 bytes with one load:
 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
 */
p0 = (uintptr_t)>rearm_data;
*(uint64_t *)p0 = rxq->mbuf_initializer;

?

If buf_len will be inside these 64bits, we can't do it anymore.

Are you suggesting something like:

uint64_t *p0, v0; 

p0 = >rearm_data;
v0 = *p0 & REARM_MASK;
*p0 = v0 | rxq->mbuf_initializer;
? 

If so I wonder what would be the performance impact of that change.
Konstantin


> /* next 6 bytes are initialised on RX descriptor rearm */
> -   MARKER8 rearm_data;
> +   MARKER64 rearm_data;
> uint16_t data_off;
> 
> /**
> @@ -754,6 +752,7 @@ struct rte_mbuf {
> uint8_t nb_segs;  /**< Number of segments. */
> uint8_t port; /**< Input port. */
> 
> +   uint16_t buf_len; /**< Length of segment buffer. */
> uint64_t ol_flags;/**< Offload features. */
> 
> /* remaining bytes are set on RX when pulling packet from
>  * descriptor
> 
> /Jerin


[dpdk-dev] [PATCHv3 1/2] config/armv8a: disable igb_uio

2016-05-12 Thread Jianbo Liu
On 12 May 2016 at 02:25, Stephen Hemminger  
wrote:
> On Wed, 11 May 2016 22:32:16 +0530
> Jerin Jacob  wrote:
>
>> On Wed, May 11, 2016 at 08:22:59AM -0700, Stephen Hemminger wrote:
>> > On Wed, 11 May 2016 19:17:58 +0530
>> > Hemant Agrawal  wrote:
>> >
>> > > IGB_UIO not supported for arm64 arch in kernel so disable.
>> > >
>> > > Signed-off-by: Hemant Agrawal 
>> > > Reviewed-by: Santosh Shukla 
>> >
>> > Really, I have use IGB_UIO on ARM64
>>
>> May I know what is the technical use case for igb_uio on arm64
>> which cannot be addressed through vfio or vfioionommu.
>
> I was running on older kernel which did not support vfioionommu mode.

As I said, most of DPDK developers are not kernel developers. They may
have their own kernel tree, and couldn't like to upgrade to latest
kernel.
They can choose to use or not use igb_uio when binding the driver. But
blindly disabling it in the base config seems unreasonable.


[dpdk-dev] [PATCH v3] examples/qos_meter: fix unchecked return value

2016-05-12 Thread Dumitrescu, Cristian


> -Original Message-
> From: Dumitrescu, Cristian
> Sent: Thursday, May 12, 2016 10:41 AM
> To: Mrozowicz, SlawomirX 
> Cc: dev at dpdk.org; Singh, Jasvinder 
> Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return value
> 
> 
> 
> > -Original Message-
> > From: Mrozowicz, SlawomirX
> > Sent: Thursday, May 12, 2016 8:06 AM
> > To: Dumitrescu, Cristian 
> > Cc: dev at dpdk.org; Singh, Jasvinder 
> > Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return value
> >
> >
> >
> > >-Original Message-
> > >From: Dumitrescu, Cristian
> > >Sent: Wednesday, May 11, 2016 7:57 PM
> > >To: Mrozowicz, SlawomirX 
> > >Cc: dev at dpdk.org; Singh, Jasvinder 
> > >Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return value
> > >
> > >
> > >
> > >> -Original Message-
> > >> From: Mrozowicz, SlawomirX
> > >> Sent: Wednesday, May 11, 2016 10:15 AM
> > >> To: Dumitrescu, Cristian 
> > >> Cc: dev at dpdk.org; Singh, Jasvinder 
> > >> Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return
> value
> > >>
> > >>
> > >>
> > >> >-Original Message-
> > >> >From: Dumitrescu, Cristian
> > >> >Sent: Tuesday, May 10, 2016 7:42 PM
> > >> >To: Mrozowicz, SlawomirX 
> > >> >Cc: dev at dpdk.org; Singh, Jasvinder 
> > >> >Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return
> > >> >value
> > >> >
> > >> >
> > >> >
> > >> >> -Original Message-
> > >> >> From: Mrozowicz, SlawomirX
> > >> >> Sent: Monday, May 9, 2016 9:38 AM
> > >> >> To: Dumitrescu, Cristian 
> > >> >> Cc: dev at dpdk.org; Singh, Jasvinder ;
> > >> >> Mrozowicz, SlawomirX 
> > >> >> Subject: [PATCH v3] examples/qos_meter: fix unchecked return
> value
> > >> >>
> > >> >> Fix issue reported by Coverity.
> > >> >>
> > >> >> Coverity ID 30693: Unchecked return value
> > >> >> check_return: Calling rte_meter_srtcm_config without checking
> > >> >> return value.
> > >> >>
> > >> >> Fixes: e6541fdec8b2 ("meter: initial import")
> > >> >>
> > >> >> Signed-off-by: Slawomir Mrozowicz
> > 
> > >> >> ---
> > >> >>  examples/qos_meter/main.c | 15 ++-
> > >> >> examples/qos_meter/main.h |  2 +-
> > >> >>  2 files changed, 11 insertions(+), 6 deletions(-)
> > >> >>
> > >> >> diff --git a/examples/qos_meter/main.c
> > b/examples/qos_meter/main.c
> > >> >> index b968b00..7c69606 100644
> > >> >> --- a/examples/qos_meter/main.c
> > >> >> +++ b/examples/qos_meter/main.c
> > >> >> @@ -133,14 +133,17 @@ struct rte_meter_trtcm_params
> > >> >app_trtcm_params[]
> > >> >> = {
> > >> >>
> > >> >>  FLOW_METER app_flows[APP_FLOWS_MAX];
> > >> >>
> > >> >> -static void
> > >> >> +static int
> > >> >>  app_configure_flow_table(void)
> > >> >>  {
> > >> >>   uint32_t i, j;
> > >> >> + int ret = 0;
> > >> >>
> > >> >> - for (i = 0, j = 0; i < APP_FLOWS_MAX; i ++, j = (j + 1) %
> > >> >> RTE_DIM(PARAMS)){
> > >> >> - FUNC_CONFIG(_flows[i], [j]);
> > >> >> - }
> > >> >> + for (i = 0, j = 0; i < APP_FLOWS_MAX && ret == 0;
> > >> >> + i ++, j = (j + 1) % RTE_DIM(PARAMS))
> > >> >> + ret = FUNC_CONFIG(_flows[i], [j]);
> > >> >> +
> > >> >> + return ret;
> > >> >>  }
> > >> >
> > >> >This is only returns the configuration status for the last flow and
> > >> >leaves undetected an error for any other flow. Why not check the
> > >> >status for each flow and return an error on first occurrence?
> > >> >For (...){ret = FUNC_CONFIG(...); if (ret) return ret;}
> > >> >
> > >>
> > >> This code check status of the function FUNC_CONFIG for each flow and
> > >> return an error on first occurrence. Rest of functions FUNC_CONFIG are
> > >> not called. See terminate condition of the loop.
> > >>
> > >
> > >Where is the status of FUNC_CONFIG checked exactly? I cannot see any
> > check
> > >in your code. I can only see returning the status code for the last call of
> this
> > >function in the for loop. I was expecting a check such as: if (ret) return 
> > >ret.
> > >
> >
> > Look at the loop terminate conditions:
> > i < APP_FLOWS_MAX && ret == 0
> > Program terminate the loop if the ret variable is differ then zero.
> > It means that program terminate if the last status of FUNC_CONFIG is an
> > error.
> 
> Yes, you're right, my bad, sorry.
> 

Actually, although the logic is correct, personally I find this code very 
cryptic / hard to read and more difficult to extend later if need be, can we 
please make it a bit more readable:

for (i = 0, j = 0; i < APP_FLOWS_MAX; i ++, j = (j + 1) % RTE_DIM(PARAMS)) {
ret = FUNC_CONFIG(_flows[i], [j]);
if (ret)
return ret;
}

return ret;

Thanks, Slawomir!

> >
> > >> >>
> > >> >>  static inline void
> > >> >> @@ -381,7 +384,9 @@ main(int argc, char **argv)
> > >> >>   rte_eth_promiscuous_enable(port_tx);
> > >> >>
> > >> >>   /* App configuration */
> > >> >> - app_configure_flow_table();
> > >> >> + ret = app_configure_flow_table();
> > >> >> + if (ret < 0)
> 

[dpdk-dev] Updates to large patchsets

2016-05-12 Thread Thomas Monjalon
2016-05-11 14:22, Stephen Hurd:
> When submitting an update to a single patch in a large patchset, what's the
> preferred method?  Do I send a single "[PATCH v2 01/40]" email with the
> modified patch or do I send the entire series of 40 patches again?

Good question :)
It's generally easier to understand if the whole series is sent.
If the series is large, it is better to wait to have several changes.
If you are almost sure there will be no more change except 1 or 2 small
ones, then you can send only these single patches.
Anyway, please keep patchwork status updated, i.e. mark old patches as
superseded.


[dpdk-dev] [PATCH v3] examples/qos_meter: fix unchecked return value

2016-05-12 Thread Dumitrescu, Cristian


> -Original Message-
> From: Mrozowicz, SlawomirX
> Sent: Thursday, May 12, 2016 8:06 AM
> To: Dumitrescu, Cristian 
> Cc: dev at dpdk.org; Singh, Jasvinder 
> Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return value
> 
> 
> 
> >-Original Message-
> >From: Dumitrescu, Cristian
> >Sent: Wednesday, May 11, 2016 7:57 PM
> >To: Mrozowicz, SlawomirX 
> >Cc: dev at dpdk.org; Singh, Jasvinder 
> >Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return value
> >
> >
> >
> >> -Original Message-
> >> From: Mrozowicz, SlawomirX
> >> Sent: Wednesday, May 11, 2016 10:15 AM
> >> To: Dumitrescu, Cristian 
> >> Cc: dev at dpdk.org; Singh, Jasvinder 
> >> Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return value
> >>
> >>
> >>
> >> >-Original Message-
> >> >From: Dumitrescu, Cristian
> >> >Sent: Tuesday, May 10, 2016 7:42 PM
> >> >To: Mrozowicz, SlawomirX 
> >> >Cc: dev at dpdk.org; Singh, Jasvinder 
> >> >Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return
> >> >value
> >> >
> >> >
> >> >
> >> >> -Original Message-
> >> >> From: Mrozowicz, SlawomirX
> >> >> Sent: Monday, May 9, 2016 9:38 AM
> >> >> To: Dumitrescu, Cristian 
> >> >> Cc: dev at dpdk.org; Singh, Jasvinder ;
> >> >> Mrozowicz, SlawomirX 
> >> >> Subject: [PATCH v3] examples/qos_meter: fix unchecked return value
> >> >>
> >> >> Fix issue reported by Coverity.
> >> >>
> >> >> Coverity ID 30693: Unchecked return value
> >> >> check_return: Calling rte_meter_srtcm_config without checking
> >> >> return value.
> >> >>
> >> >> Fixes: e6541fdec8b2 ("meter: initial import")
> >> >>
> >> >> Signed-off-by: Slawomir Mrozowicz
> 
> >> >> ---
> >> >>  examples/qos_meter/main.c | 15 ++-
> >> >> examples/qos_meter/main.h |  2 +-
> >> >>  2 files changed, 11 insertions(+), 6 deletions(-)
> >> >>
> >> >> diff --git a/examples/qos_meter/main.c
> b/examples/qos_meter/main.c
> >> >> index b968b00..7c69606 100644
> >> >> --- a/examples/qos_meter/main.c
> >> >> +++ b/examples/qos_meter/main.c
> >> >> @@ -133,14 +133,17 @@ struct rte_meter_trtcm_params
> >> >app_trtcm_params[]
> >> >> = {
> >> >>
> >> >>  FLOW_METER app_flows[APP_FLOWS_MAX];
> >> >>
> >> >> -static void
> >> >> +static int
> >> >>  app_configure_flow_table(void)
> >> >>  {
> >> >> uint32_t i, j;
> >> >> +   int ret = 0;
> >> >>
> >> >> -   for (i = 0, j = 0; i < APP_FLOWS_MAX; i ++, j = (j + 1) %
> >> >> RTE_DIM(PARAMS)){
> >> >> -   FUNC_CONFIG(_flows[i], [j]);
> >> >> -   }
> >> >> +   for (i = 0, j = 0; i < APP_FLOWS_MAX && ret == 0;
> >> >> +   i ++, j = (j + 1) % RTE_DIM(PARAMS))
> >> >> +   ret = FUNC_CONFIG(_flows[i], [j]);
> >> >> +
> >> >> +   return ret;
> >> >>  }
> >> >
> >> >This is only returns the configuration status for the last flow and
> >> >leaves undetected an error for any other flow. Why not check the
> >> >status for each flow and return an error on first occurrence?
> >> >For (...){ret = FUNC_CONFIG(...); if (ret) return ret;}
> >> >
> >>
> >> This code check status of the function FUNC_CONFIG for each flow and
> >> return an error on first occurrence. Rest of functions FUNC_CONFIG are
> >> not called. See terminate condition of the loop.
> >>
> >
> >Where is the status of FUNC_CONFIG checked exactly? I cannot see any
> check
> >in your code. I can only see returning the status code for the last call of 
> >this
> >function in the for loop. I was expecting a check such as: if (ret) return 
> >ret.
> >
> 
> Look at the loop terminate conditions:
> i < APP_FLOWS_MAX && ret == 0
> Program terminate the loop if the ret variable is differ then zero.
> It means that program terminate if the last status of FUNC_CONFIG is an
> error.

Yes, you're right, my bad, sorry.

> 
> >> >>
> >> >>  static inline void
> >> >> @@ -381,7 +384,9 @@ main(int argc, char **argv)
> >> >> rte_eth_promiscuous_enable(port_tx);
> >> >>
> >> >> /* App configuration */
> >> >> -   app_configure_flow_table();
> >> >> +   ret = app_configure_flow_table();
> >> >> +   if (ret < 0)
> >> >> +   rte_exit(EXIT_FAILURE, "Invalid configure flow 
> >> >> table\n");
> >> >>
> >> >> /* Launch per-lcore init on every lcore */
> >> >> rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER); diff -
> >> -
> >> >git
> >> >> a/examples/qos_meter/main.h b/examples/qos_meter/main.h index
> >> >> 530bf69..54867dc 100644
> >> >> --- a/examples/qos_meter/main.h
> >> >> +++ b/examples/qos_meter/main.h
> >> >> @@ -51,7 +51,7 @@ enum policer_action
> >> >> policer_table[e_RTE_METER_COLORS][e_RTE_METER_COLORS] =  #if
> >> >APP_MODE
> >> >> == APP_MODE_FWD
> >> >>
> >> >>  #define FUNC_METER(a,b,c,d) color, flow_id=flow_id,
> >> >> pkt_len=pkt_len, time=time -#define FUNC_CONFIG(a,b)
> >> >> +#define FUNC_CONFIG(a, b) 0
> >> >>  #define PARAMS app_srtcm_params
> >> >>  #define FLOW_METER int
> >> >>
> >> >> --
> >> >> 

[dpdk-dev] [PATCH v4 6/8] virtio-user: add new virtual pci driver for virtio

2016-05-12 Thread Yuanhan Liu
On Thu, May 12, 2016 at 03:08:05PM +0800, Tan, Jianfeng wrote:
> >>+static int
> >>+vdev_setup_queue(struct virtio_hw *hw __rte_unused, struct virtqueue *vq)
> >>+{
> >>+   /* Changed to use virtual addr */
> >>+   vq->vq_ring_mem = (phys_addr_t)vq->mz->addr;
> >>+   if (vq->virtio_net_hdr_mz) {
> >>+   vq->virtio_net_hdr_mem =
> >>+   (phys_addr_t)vq->virtio_net_hdr_mz->addr;
> >>+   /* Do it one more time after we reset virtio_net_hdr_mem */
> >>+   vring_hdr_desc_init(vq);
> >>+   }
> >>+   vq->offset = offsetof(struct rte_mbuf, buf_addr);
> >>+   return 0;
> >Here as last email said, you should not mix vq stuff. What's more,
> >why do you invoke vring_hdr_desc_init() here?
> 
> vring_hdr_desc_init() is to init header desc according to
> vq->virtio_net_hdr_mem, and here we change to use virtual address, so we
> need to invoke this after vq->virtio_net_hdr_mem is decided.
> 
> But for this case, you remind me that we can achieve that by: inside
> virtio_dev_queue_setup(), move vring_hdr_desc_init() after setup_queue().
> 
> >If it needs a special
> >handling, do it in driver.
> 
> As discussed in previous mail with David, we should hide special handling
> inside pci ops,

Generally speaking, yes.

> such as real virtio device needs to check address (patch 1).

And that's a good one: I've already acked. But here, I doubt it introduces 
any benefits to do that. Firstly, it's a Tx queue specific setting, moving
it to common code path means you have to add a check like the way you did
in this patch. BTW, it's an implicit check, which hurts readability a bit.

Secondly, you have to do this kind of check/settings in 3 different places: 

- legacy queue_setup() method
- modern queue_setup() method
- your vdev queue_setup() method

And another remind is that Huawei planned to split Rx/Tx queue settings,
here you mixed them again, and I don't think Huawei would like it. Don't
even to say that after the split, the Tx specific stuff will be no longer
in the common vq structure.

So, I'd suggest something like following:

if (is_vdev(..)) {
/* comment here that we use VA for vdev */
vq->vq_ring_mem = (phys_addr_t)vq->mz->addr;
vq->virtio_net_hdr_mem = ...;
vq->offset = ...;
} else {
vq->vq_ring_mem = ...;
...
}
vring_hdr_desc_init(vq);

> See below comments for more detail.
> 
> >
> >The "setup_queue" method is actually for telling the device where desc,
> >avail and used vring are located. Hence, the implementation could be simple:
> >just log them.
> 
> >
> >>+
> >>+const struct virtio_pci_ops vdev_ops = {
> >Note that this is the interface for the driver to talk to the device,
> >we should put this file into upper layer then, in the driver.
> >
> >And let me make a summary, trying to make it clear:
> >
> >- We should not use any structures/functions from the virtio driver
> >   here, unless it's really a must.
> 
> Firstly I agree this point (although I see a difference in how we take "a
> must"). My original principle is to maximize the use of existing structures
> instead of maintain any new ones.

If that could save you a lot of efforts and make the design clean, I
might would say, yes, go for it. But it's obviously NO in this case.

> And I already give up that principle when
> I accept your previous suggestion to use struct virtio_user_device to store
> virtio-user specific fields. So I agree to add the new struct virtqueue to
> avoid use of driver-layer virtqueues.
> 
> >
> >- It's allowed for driver to make *few* special handling for the virtio
> >   user device. And that's what the driver supposed to do: to handle
> >   different device variants.
> 
> So here are two contradictory ways. Compared to the way you suggest,
> another way is to keep a unified driver and maintain all special handling
> inside struct virtio_pci_ops.
> 
> I prefer the latter because:
> (1) Special handling for each kind of device will be gather together instead
> of scattered everywhere of driver code.
> (2) It's more aligned to previous logic to hide the detail to differentiate
> modern/legacy device.

May I ask how many more such handling are needed, excluding the tx queue
header desc setup? And as stated, in generic, yes, we should try that.

--yliu


[dpdk-dev] virtio pmd failed in pci probing

2016-05-12 Thread Vincent Li
Hi

I am testing mTCP https://github.com/eunyoung14/mtcp  with dpdk-16.04
on KVM guest and it appears the virtio pmd fail to load, detail info
below:


# ./tools/dpdk_nic_bind.py --status

Network devices using DPDK-compatible driver

:00:07.0 'Virtio network device' drv=igb_uio unused=

Network devices using kernel driver
===
:00:03.0 'Virtio network device' if= drv=virtio-pci unused=igb_uio
:00:08.0 'Virtio network device' if= drv=virtio-pci unused=igb_uio

in mtcp/src/io_module.c, I have (I hard code the whiltelist to make it
easy to test for me):



   sprintf(whitelist, "%s", ":00:07.0");
   /* initialize the rte env first, what a waste of
implementation effort!  */
char *argv[] = {"",
"-c",
cpumaskbuf,
"-w",
whitelist,
"-n",
mem_channels,
"--proc-type=auto",
""
};
const int argc = 8;

/*
 * re-set getopt extern variable optind.
 * this issue was a bitch to debug
 * rte_eal_init() internally uses getopt() syscall
 * mtcp applications that also use an `external' getopt
 * will cause a violent crash if optind is not reset to zero
 * prior to calling the func below...
 * see man getopt(3) for more details
 */
optind = 0;

/* initialize the dpdk eal env */
ret = rte_eal_init(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Invalid EAL args!\n");
/* give me the count of 'detected' ethernet ports */
num_devices = rte_eth_dev_count();
if (num_devices == 0) {
rte_exit(EXIT_FAILURE, "No Ethernet port!\n");
}

in dpdk-16.04/lib/librte_eal/common/eal_common_pci.c
rte_eal_pci_probe_one_driver, I added debug line below:

/*
 * If vendor/device ID match, call the devinit() function of the
 * driver.
 */
static int
rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, struct
rte_pci_device *dev)
{
int ret;
const struct rte_pci_id *id_table;

RTE_LOG(DEBUG, EAL, "  dev->id.vendor_id:dev->id.device_id
dr->name %x:%x %s\n", dev->id.vendor_id,
dev->id.device_id, dr->name);


when I run mTCP app as below, it says "No Ethernet port!", please note
that the debug line did not show rte_virtio_pmd, why? is it because
virtio pmd lack of vendor_id/device_id implementation?


EAL: WARNING: cpu flags constant_tsc=yes nonstop_tsc=no -> using
unreliable clock cycles !
EAL: Master lcore 0 is ready (tid=dca0e900;cpuset=[0])
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_vmxnet3_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_i40e_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_i40evf_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_ixgbe_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_ixgbevf_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_igb_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_igbvf_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_em_pmd
pci_probe_all_drivers ret: 1 devargs not NULL and whitelisted
EAL: Error - exiting with code: 1
  Cause: No Ethernet port!


dpdk testpmd app detects the ethernet port fine, here is the testpmd
output (Note here the debug line shows rte_virtio_pmd):

./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf -n 4  -w :00:07.0 -- -i

EAL: Master lcore 0 is ready (tid=e9c98900;cpuset=[0])
EAL: lcore 3 is ready (tid=a69f6700;cpuset=[3])
EAL: lcore 2 is ready (tid=a71f7700;cpuset=[2])
EAL: lcore 1 is ready (tid=a79f8700;cpuset=[1])
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000 rte_vmxnet3_pmd
EAL:   dev->id.vendor_id:dev->id.device_id dr->name 1af4:1000
rte_virtio_pmd
EAL: PCI device :00:07.0 on NUMA socket -1
EAL:   probe driver: 1af4:1000 rte_virtio_pmd
EAL:   PCI memory mapped at 0x7fabe8a0


I am able to workaround this by compiling dpdk as shared library for
mTCP and use '-d' to load librte_pmd_virtio.so explicitly.

it looks to me there isn't much difference between how testpmd and
mTCP invokes dpdk, it is straightforward rte_eal_init(argc, argv) and
rte_eth_dev_count()

is there any potential bug in the implementation of virtio pmd in pci probing ?

Thanks!


[dpdk-dev] [PATCH] eal/linuxapp: fix resource leak

2016-05-12 Thread Sergio Gonzalez Monroy
Hi,

On 11/05/2016 17:01, Daniel Mrzyglod wrote:
> Fix issue reported by Coverity.
> Coverity ID 97920
>
> munmap structure of hugepage
>
> leaked_storage: Variable hugepage going out of scope leaks the storage
> it points to.
>
> The system resource will not be reclaimed and reused, reducing the future
> availability of the resource.

I'm not really fond of this commit messages, but if no one minds them, 
so be it.

> In rte_eal_hugepage_init: Leak of memory or pointers to system resources
>
> Fixes: b6a468ad41d5 ("memory: add --socket-mem option")
>
> Signed-off-by: Daniel Mrzyglod 
> ---
>   lib/librte_eal/linuxapp/eal/eal_memory.c | 4 +++-
>   1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
> b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..cd40cc9 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -1051,7 +1051,7 @@ int
>   rte_eal_hugepage_init(void)
>   {
>   struct rte_mem_config *mcfg;
> - struct hugepage_file *hugepage, *tmp_hp = NULL;
> + struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
>   struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
>   
>   uint64_t memory[RTE_MAX_NUMA_NODES];
> @@ -1374,6 +1374,8 @@ rte_eal_hugepage_init(void)
>   
>   fail:
>   free(tmp_hp);
> + if (hugepage != NULL)
> + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
>   return -1;
>   }
>   

You missed the last conditional if(...0 of the function where it returns 
directly with error value.

Looking at the code a bit, we never check for anything other than < 0 
return value, so I'd just do a 'goto fail' instead.

Sergio


[dpdk-dev] [PATCHv3 1/2] config/armv8a: disable igb_uio

2016-05-12 Thread Santosh Shukla
On Thu, May 12, 2016 at 10:01:05AM +0800, Jianbo Liu wrote:
> On 12 May 2016 at 02:25, Stephen Hemminger  
> wrote:
> > On Wed, 11 May 2016 22:32:16 +0530
> > Jerin Jacob  wrote:
> >
> >> On Wed, May 11, 2016 at 08:22:59AM -0700, Stephen Hemminger wrote:
> >> > On Wed, 11 May 2016 19:17:58 +0530
> >> > Hemant Agrawal  wrote:
> >> >
> >> > > IGB_UIO not supported for arm64 arch in kernel so disable.
> >> > >
> >> > > Signed-off-by: Hemant Agrawal 
> >> > > Reviewed-by: Santosh Shukla 
> >> >
> >> > Really, I have use IGB_UIO on ARM64
> >>
> >> May I know what is the technical use case for igb_uio on arm64
> >> which cannot be addressed through vfio or vfioionommu.
> >
> > I was running on older kernel which did not support vfioionommu mode.
> 
> As I said, most of DPDK developers are not kernel developers. They may
> have their own kernel tree, and couldn't like to upgrade to latest
> kernel.
> They can choose to use or not use igb_uio when binding the driver. But
> blindly disabling it in the base config seems unreasonable.

if user keeping his own kernel so they could also keep IGB_UIO=y in their local
dpdk tree. Why are you imposing user-x custome depedancy on upstream dpdk base
config. Is it not enough for explanation that - Base config ie.. armv8 doesn;t
support pci mmap, so igb_uio is n/a. New user wont able to build/run dpdk/arm64
in igb_uio-way, He'll prefer to use upstream stuff. I think, you are not making
sense.



[dpdk-dev] [PATCHv3 1/2] config/armv8a: disable igb_uio

2016-05-12 Thread Jerin Jacob
On Wed, May 11, 2016 at 11:25:59AM -0700, Stephen Hemminger wrote:
> On Wed, 11 May 2016 22:32:16 +0530
> Jerin Jacob  wrote:
> 
> > On Wed, May 11, 2016 at 08:22:59AM -0700, Stephen Hemminger wrote:
> > > On Wed, 11 May 2016 19:17:58 +0530
> > > Hemant Agrawal  wrote:
> > > 
> > > > IGB_UIO not supported for arm64 arch in kernel so disable.
> > > > 
> > > > Signed-off-by: Hemant Agrawal 
> > > > Reviewed-by: Santosh Shukla 
> > > 
> > > Really, I have use IGB_UIO on ARM64
> > 
> > May I know what is the technical use case for igb_uio on arm64
> > which cannot be addressed through vfio or vfioionommu.
> 
> I was running on older kernel which did not support vfioionommu mode.

That way if we see older and latest kernel does not have ibg_uio(due to
sysfs mmap issue) support .If you are back-porting the changes
I recommend to back port vfioionommu changes to old kernel.

If it comes to out of tree then dpdk out of tree configuration can also set
CONFIG_RTE_EAL_IGB_UIO or even while configuring dpdk.

IMO, It is better to keep arm64 dpdk.org changes inline with
upstream arm64 linux kernel changes.

What do you think?


[dpdk-dev] [PATCH v3] examples/qos_meter: fix unchecked return value

2016-05-12 Thread Mrozowicz, SlawomirX


>-Original Message-
>From: Dumitrescu, Cristian
>Sent: Wednesday, May 11, 2016 7:57 PM
>To: Mrozowicz, SlawomirX 
>Cc: dev at dpdk.org; Singh, Jasvinder 
>Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return value
>
>
>
>> -Original Message-
>> From: Mrozowicz, SlawomirX
>> Sent: Wednesday, May 11, 2016 10:15 AM
>> To: Dumitrescu, Cristian 
>> Cc: dev at dpdk.org; Singh, Jasvinder 
>> Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return value
>>
>>
>>
>> >-Original Message-
>> >From: Dumitrescu, Cristian
>> >Sent: Tuesday, May 10, 2016 7:42 PM
>> >To: Mrozowicz, SlawomirX 
>> >Cc: dev at dpdk.org; Singh, Jasvinder 
>> >Subject: RE: [PATCH v3] examples/qos_meter: fix unchecked return
>> >value
>> >
>> >
>> >
>> >> -Original Message-
>> >> From: Mrozowicz, SlawomirX
>> >> Sent: Monday, May 9, 2016 9:38 AM
>> >> To: Dumitrescu, Cristian 
>> >> Cc: dev at dpdk.org; Singh, Jasvinder ;
>> >> Mrozowicz, SlawomirX 
>> >> Subject: [PATCH v3] examples/qos_meter: fix unchecked return value
>> >>
>> >> Fix issue reported by Coverity.
>> >>
>> >> Coverity ID 30693: Unchecked return value
>> >> check_return: Calling rte_meter_srtcm_config without checking
>> >> return value.
>> >>
>> >> Fixes: e6541fdec8b2 ("meter: initial import")
>> >>
>> >> Signed-off-by: Slawomir Mrozowicz 
>> >> ---
>> >>  examples/qos_meter/main.c | 15 ++-
>> >> examples/qos_meter/main.h |  2 +-
>> >>  2 files changed, 11 insertions(+), 6 deletions(-)
>> >>
>> >> diff --git a/examples/qos_meter/main.c b/examples/qos_meter/main.c
>> >> index b968b00..7c69606 100644
>> >> --- a/examples/qos_meter/main.c
>> >> +++ b/examples/qos_meter/main.c
>> >> @@ -133,14 +133,17 @@ struct rte_meter_trtcm_params
>> >app_trtcm_params[]
>> >> = {
>> >>
>> >>  FLOW_METER app_flows[APP_FLOWS_MAX];
>> >>
>> >> -static void
>> >> +static int
>> >>  app_configure_flow_table(void)
>> >>  {
>> >>   uint32_t i, j;
>> >> + int ret = 0;
>> >>
>> >> - for (i = 0, j = 0; i < APP_FLOWS_MAX; i ++, j = (j + 1) %
>> >> RTE_DIM(PARAMS)){
>> >> - FUNC_CONFIG(_flows[i], [j]);
>> >> - }
>> >> + for (i = 0, j = 0; i < APP_FLOWS_MAX && ret == 0;
>> >> + i ++, j = (j + 1) % RTE_DIM(PARAMS))
>> >> + ret = FUNC_CONFIG(_flows[i], [j]);
>> >> +
>> >> + return ret;
>> >>  }
>> >
>> >This is only returns the configuration status for the last flow and
>> >leaves undetected an error for any other flow. Why not check the
>> >status for each flow and return an error on first occurrence?
>> >For (...){ret = FUNC_CONFIG(...); if (ret) return ret;}
>> >
>>
>> This code check status of the function FUNC_CONFIG for each flow and
>> return an error on first occurrence. Rest of functions FUNC_CONFIG are
>> not called. See terminate condition of the loop.
>>
>
>Where is the status of FUNC_CONFIG checked exactly? I cannot see any check
>in your code. I can only see returning the status code for the last call of 
>this
>function in the for loop. I was expecting a check such as: if (ret) return ret.
>

Look at the loop terminate conditions:
i < APP_FLOWS_MAX && ret == 0
Program terminate the loop if the ret variable is differ then zero.
It means that program terminate if the last status of FUNC_CONFIG is an error.

>> >>
>> >>  static inline void
>> >> @@ -381,7 +384,9 @@ main(int argc, char **argv)
>> >>   rte_eth_promiscuous_enable(port_tx);
>> >>
>> >>   /* App configuration */
>> >> - app_configure_flow_table();
>> >> + ret = app_configure_flow_table();
>> >> + if (ret < 0)
>> >> + rte_exit(EXIT_FAILURE, "Invalid configure flow table\n");
>> >>
>> >>   /* Launch per-lcore init on every lcore */
>> >>   rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER); diff -
>> -
>> >git
>> >> a/examples/qos_meter/main.h b/examples/qos_meter/main.h index
>> >> 530bf69..54867dc 100644
>> >> --- a/examples/qos_meter/main.h
>> >> +++ b/examples/qos_meter/main.h
>> >> @@ -51,7 +51,7 @@ enum policer_action
>> >> policer_table[e_RTE_METER_COLORS][e_RTE_METER_COLORS] =  #if
>> >APP_MODE
>> >> == APP_MODE_FWD
>> >>
>> >>  #define FUNC_METER(a,b,c,d) color, flow_id=flow_id,
>> >> pkt_len=pkt_len, time=time -#define FUNC_CONFIG(a,b)
>> >> +#define FUNC_CONFIG(a, b) 0
>> >>  #define PARAMS   app_srtcm_params
>> >>  #define FLOW_METER int
>> >>
>> >> --
>> >> 1.9.1



[dpdk-dev] [RFC] mbuf: remove unused rx error flags

2016-05-12 Thread John Daley (johndale)
Hi,

> -Original Message-
> From: Olivier Matz [mailto:olivier.matz at 6wind.com]
> Sent: Tuesday, May 10, 2016 1:40 AM
> To: dev at dpdk.org
> Cc: konstantin.ananyev at intel.com; John Daley (johndale)
> ; helin.zhang at intel.com; arnon at qwilt.com;
> rolette at infinite.io
> Subject: [RFC] mbuf: remove unused rx error flags
> 
> Following the discussions from:
> http://dpdk.org/ml/archives/dev/2015-July/021721.html
> http://dpdk.org/ml/archives/dev/2016-April/038143.html
> 
> The value of these flags is 0, making them useless. Today, no example
> application checks them on RX, and only few drivers sets them, and silently
> give wrong packets to the application, which should not happen.
> 
> This patch removes the unused flags from rte_mbuf, and their use in the
> drivers. The enic driver is modified to drop bad packets, but i40e and fm10k
> are kept as they are today and should be fixed to drop bad packets.

Perhaps the change to enic to drop bad packets should be handled as a separate 
patch since it's not strictly related to not removing the use of the flags?

> 
> Fixes: c22265f6 ("mbuf: add new packet flags for i40e")
> Signed-off-by: Olivier Matz 
> ---
> 
> Hi,
> 
> Here is a draft patch that removes the rx mbuf flags, as discussed.
> 
> John, I did not test the patch on the enic driver, so please review it 
> carefully.
> 

The patch works for enic, thanks. There are a few minor changes for performance:
 - put an unlikely in the if (packet_error) conditional
- remove the if (!packet_error) conditional since it will always be true.
Let me know if you would prefer I submit the enic patch separately.

> Comments are welcome.
> 
> Olivier
> 
> 
>  drivers/net/enic/enic_rx.c | 31 ++-
>  drivers/net/fm10k/fm10k_rxtx.c | 16 
>  drivers/net/fm10k/fm10k_rxtx_vec.c |  2 +-
>  drivers/net/i40e/i40e_rxtx.c   | 15 ---
>  lib/librte_mbuf/rte_mbuf.c |  4 
>  lib/librte_mbuf/rte_mbuf.h |  5 +
>  6 files changed, 16 insertions(+), 57 deletions(-)
> 
> diff --git a/drivers/net/enic/enic_rx.c b/drivers/net/enic/enic_rx.c index
> b3ad9ea..bad802e 100644
> --- a/drivers/net/enic/enic_rx.c
> +++ b/drivers/net/enic/enic_rx.c
> @@ -144,20 +144,15 @@ enic_cq_rx_desc_n_bytes(struct cq_desc *cqd)  }
> 
>  static inline uint8_t
> -enic_cq_rx_to_pkt_err_flags(struct cq_desc *cqd, uint64_t
> *pkt_err_flags_out)
> +enic_cq_rx_check_err(struct cq_desc *cqd)
>  {
>   struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
>   uint16_t bwflags;
> - int ret = 0;
> - uint64_t pkt_err_flags = 0;
> 
>   bwflags = enic_cq_rx_desc_bwflags(cqrd);
> - if (unlikely(enic_cq_rx_desc_packet_error(bwflags))) {
> - pkt_err_flags = PKT_RX_MAC_ERR;
> - ret = 1;
> - }
> - *pkt_err_flags_out = pkt_err_flags;
> - return ret;
> + if (unlikely(enic_cq_rx_desc_packet_error(bwflags)))
> + return 1;
> + return 0;
>  }
> 
>  /*
> @@ -253,7 +248,7 @@ enic_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts,
>   struct enic *enic = vnic_dev_priv(rq->vdev);
>   unsigned int rx_id;
>   struct rte_mbuf *nmb, *rxmb;
> - uint16_t nb_rx = 0;
> + uint16_t nb_rx = 0, nb_err = 0;
>   uint16_t nb_hold;
>   struct vnic_cq *cq;
>   volatile struct cq_desc *cqd_ptr;
> @@ -269,7 +264,6 @@ enic_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts,
>   volatile struct rq_enet_desc *rqd_ptr;
>   dma_addr_t dma_addr;
>   struct cq_desc cqd;
> - uint64_t ol_err_flags;
>   uint8_t packet_error;
> 
>   /* Check for pkts available */
> @@ -293,7 +287,7 @@ enic_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts,
>   }
> 
>   /* A packet error means descriptor and data are untrusted */
> - packet_error = enic_cq_rx_to_pkt_err_flags(,
> _err_flags);
> + packet_error = enic_cq_rx_check_err();
> 
>   /* Get the mbuf to return and replace with one just allocated
> */
>   rxmb = rq->mbuf_ring[rx_id];
> @@ -318,6 +312,13 @@ enic_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts,
>   rqd_ptr->address = rte_cpu_to_le_64(dma_addr);
>   rqd_ptr->length_type = cpu_to_le16(nmb->buf_len);
> 
> + /* Drop incoming bad packet */
> + if (packet_error) {
> + rte_pktmbuf_free(rxmb);
> + nb_err++;
> + continue;
> + }
> +
>   /* Fill in the rest of the mbuf */
>   rxmb->data_off = RTE_PKTMBUF_HEADROOM;
>   rxmb->nb_segs = 1;
> @@ -327,10 +328,6 @@ enic_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts,
>   rxmb->pkt_len = enic_cq_rx_desc_n_bytes();
>   rxmb->packet_type =
> enic_cq_rx_flags_to_pkt_type();
>   

[dpdk-dev] [PATCH v4] eal: make hugetlb initialization more robust

2016-05-12 Thread Jianfeng Tan
This patch adds an option, --huge-trybest, to use a recover mechanism to
the case that there are not so many hugepages (declared in sysfs), which
can be used. It relys on a mem access to fault-in hugepages, and if fails
with SIGBUS, recover to previously saved stack environment with
siglongjmp().

Besides, this solution fixes an issue when hugetlbfs is specified with an
option of size. Currently DPDK does not respect the quota of a hugetblfs
mount. It fails to init the EAL because it tries to map the number of free
hugepages in the system rather than using the number specified in the quota
for that mount.

It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
fail to remap hugepages as it relies on having mapped all free hugepages
in the system.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan 
Acked-by: Neil Horman 
---
v4:
 - Change map_all_hugepages to return unsigned instead of int.
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/common/eal_common_options.c |   4 +
 lib/librte_eal/common/eal_internal_cfg.h   |   1 +
 lib/librte_eal/common/eal_options.h|   2 +
 lib/librte_eal/linuxapp/eal/eal.c  |   1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 118 +
 5 files changed, 112 insertions(+), 14 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c 
b/lib/librte_eal/common/eal_common_options.c
index 3efc90f..e9a111d 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM},
{OPT_VMWARE_TSC_MAP,0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
{OPT_XEN_DOM0,  0, NULL, OPT_XEN_DOM0_NUM },
+   {OPT_HUGE_TRYBEST,  0, NULL, OPT_HUGE_TRYBEST_NUM },
{0, 0, NULL, 0}
 };

@@ -899,6 +900,9 @@ eal_parse_common_option(int opt, const char *optarg,
return -1;
}
break;
+   case OPT_HUGE_TRYBEST_NUM:
+   internal_config.huge_trybest = 1;
+   break;

/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h 
b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
volatile unsigned force_nchannel; /**< force number of channels */
volatile unsigned force_nrank;/**< force number of ranks */
volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+   volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
unsigned hugepage_unlink; /**< true to unlink backing files */
volatile unsigned xen_dom0_support; /**< support app running on Xen 
Dom0*/
volatile unsigned no_pci; /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h 
b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0  "xen-dom0"
OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST  "huge-trybest"
+   OPT_HUGE_TRYBEST_NUM,
OPT_LONG_MAX_NUM
 };

diff --git a/lib/librte_eal/linuxapp/eal/eal.c 
b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..eeb1d4e 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
   "  --"OPT_CREATE_UIO_DEV"Create /dev/uioX (usually done by 
hotplug)\n"
   "  --"OPT_VFIO_INTR" Interrupt mode for VFIO 
(legacy|msi|msix)\n"
   "  --"OPT_XEN_DOM0"  Support running on Xen dom0 without 
hugetlbfs\n"
+  "  --"OPT_HUGE_TRYBEST"  Try best to accommodate hugepages\n"
   "\n");
/* Allow the application to print its usage message too if hook is set 
*/
if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..8c77010 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++