[dpdk-dev] [PATCH v3 4/4] virtio: check if kernel driver is manipulating the virtio device

2016-01-27 Thread Huawei Xie
v3 changes:
 change the log message to tell the user that the virtio device is skipped
because it is managed by a kernel driver, instead of asking the user to
unbind it from the kernel driver.

v2 changes:
 change LOG level from ERR to INFO

virtio PMD could use IO port to configure the virtio device without
using uio driver (vfio-noiommu mode should work as well).

There are two issues with previous implementation:
1) virtio PMD will take over each virtio device blindly even if some
are not intended for DPDK.
2) driver conflict between virtio PMD and virtio-net kernel driver.

This patch checks if there is any kernel driver manipulating the virtio
device before virtio PMD uses IO port to configure the device.

Fixes: da978dfdc43b ("virtio: use port IO to get PCI resource")

Signed-off-by: Huawei Xie 
---
 drivers/net/virtio/virtio_ethdev.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/virtio/virtio_ethdev.c 
b/drivers/net/virtio/virtio_ethdev.c
index e815acd..ea1874a 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -1138,6 +1138,11 @@ static int virtio_resource_init_by_ioports(struct 
rte_pci_device *pci_dev)
int found = 0;
size_t linesz;

+   if (pci_dev->kdrv != RTE_KDRV_NONE) {
+   PMD_INIT_LOG(INFO, "skip kernel managed virtio device.");
+   return -1;
+   }
+
snprintf(pci_id, sizeof(pci_id), PCI_PRI_FMT,
 pci_dev->addr.domain,
 pci_dev->addr.bus,
-- 
1.8.1.4



[dpdk-dev] [PATCH v3 3/4] virtio: return 1 to tell the upper layer we don't take over this device

2016-01-27 Thread Huawei Xie
v2 changes:
 Remove unnecessary assignment of NULL to dev->data->mac_addrs
 Adjust one comment's position

if virtio_resource_init fails, cleanup the resource and return 1 to
tell the upper layer we don't take over this device. -1 means error
which will cause DPDK to exit.

Signed-off-by: Huawei Xie 
---
 drivers/net/virtio/virtio_ethdev.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio/virtio_ethdev.c 
b/drivers/net/virtio/virtio_ethdev.c
index d928339..e815acd 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -1287,8 +1287,13 @@ eth_virtio_dev_init(struct rte_eth_dev *eth_dev)

pci_dev = eth_dev->pci_dev;

-   if (virtio_resource_init(pci_dev) < 0)
-   return -1;
+   if (virtio_resource_init(pci_dev) < 0) {
+   rte_free(eth_dev->data->mac_addrs);
+   /* Return 1 to tell the upper layer we don't take over
+* this device.
+*/
+   return 1;
+   }

hw->use_msix = virtio_has_msix(&pci_dev->addr);
hw->io_base = (uint32_t)(uintptr_t)pci_dev->mem_resource[0].addr;
-- 
1.8.1.4



[dpdk-dev] [PATCH v3 2/4] eal: set kdrv to RTE_KDRV_NONE if kernel driver isn't manipulating the device.

2016-01-27 Thread Huawei Xie
Use RTE_KDRV_NONE to indicate that kernel driver isn't manipulating
the device.

Signed-off-by: Huawei Xie 
Acked-by: David Marchand 
---
 lib/librte_eal/linuxapp/eal/eal_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c 
b/lib/librte_eal/linuxapp/eal/eal_pci.c
index bc5b5be..640b190 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -362,7 +362,7 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t 
bus,
else
dev->kdrv = RTE_KDRV_UNKNOWN;
} else
-   dev->kdrv = RTE_KDRV_UNKNOWN;
+   dev->kdrv = RTE_KDRV_NONE;

/* device is valid, add in list (sorted) */
if (TAILQ_EMPTY(&pci_device_list)) {
-- 
1.8.1.4



[dpdk-dev] [PATCH v3 1/4] eal: make the comment more accurate

2016-01-27 Thread Huawei Xie
positive return of rte_eal_pci_probe_one_driver means the driver doesn't 
support the device.

Signed-off-by: Huawei Xie 
---
 lib/librte_eal/common/eal_common_pci.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_pci.c 
b/lib/librte_eal/common/eal_common_pci.c
index dcfe947..bbcdb2b 100644
--- a/lib/librte_eal/common/eal_common_pci.c
+++ b/lib/librte_eal/common/eal_common_pci.c
@@ -204,7 +204,7 @@ rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, 
struct rte_pci_device *d
/* call the driver devinit() function */
return dr->devinit(dr, dev);
}
-   /* return positive value if driver is not found */
+   /* return positive value if driver doesn't support this device */
return 1;
 }

@@ -259,7 +259,7 @@ rte_eal_pci_detach_dev(struct rte_pci_driver *dr,
return 0;
}

-   /* return positive value if driver is not found */
+   /* return positive value if driver doesn't support this device */
return 1;
 }

@@ -283,7 +283,7 @@ pci_probe_all_drivers(struct rte_pci_device *dev)
/* negative value is an error */
return -1;
if (rc > 0)
-   /* positive value means driver not found */
+   /* positive value means driver doesn't support it */
continue;
return 0;
}
@@ -310,7 +310,7 @@ pci_detach_all_drivers(struct rte_pci_device *dev)
/* negative value is an error */
return -1;
if (rc > 0)
-   /* positive value means driver not found */
+   /* positive value means driver doesn't support it */
continue;
return 0;
}
-- 
1.8.1.4



[dpdk-dev] [PATCH v3 0/4] fix the issue that DPDK takes over virtio device blindly

2016-01-27 Thread Huawei Xie
v3 changes:
 change the log message to tell the user that the virtio device is skipped
because it is managed by a kernel driver, instead of asking the user to
unbind it from the kernel driver.

v2 changes:
 Remove unnecessary assignment of NULL to dev->data->mac_addrs
 Adjust one comment's position
 change LOG level from ERR to INFO

virtio PMD doesn't set RTE_PCI_DRV_NEED_MAPPING in drv_flags of its
eth_driver. It will try igb_uio and PORT IO in turn to configure
virtio device. Even if the user in the guest VM doesn't want to use virtio
for DPDK, virtio PMD will take over the device blindly.

The more serious problem is kernel driver is still manipulating the
device, which causes driver conflict.

This patch checks if there is any kernel driver manipulating the
virtio device before virtio PMD uses port IO to configure the device.

Huawei Xie (4):
  eal: make the comment more accurate
  eal: set kdrv to RTE_KDRV_NONE if kernel driver isn't manipulating the device.
  virtio: return 1 to tell the upper layer we don't take over this device
  virtio: check if kernel driver is manipulating the virtio device

 drivers/net/virtio/virtio_ethdev.c | 14 --
 lib/librte_eal/common/eal_common_pci.c |  8 
 lib/librte_eal/linuxapp/eal/eal_pci.c  |  2 +-
 3 files changed, 17 insertions(+), 7 deletions(-)

-- 
1.8.1.4



[dpdk-dev] [dpdk-dev,v2] Clean up rte_memcpy.h file

2016-01-27 Thread Zhihong Wang
> Remove unnecessary type casting in functions.
> 
> Tested on Ubuntu (14.04 x86_64) with "make test".
> "make test" results match the results with baseline.
> "Memcpy perf" results match the results with baseline.
> 
> Signed-off-by: Ravi Kerur 
> Acked-by: Stephen Hemminger 
> 
> ---
> .../common/include/arch/x86/rte_memcpy.h   | 340 +++--
>  1 file changed, 175 insertions(+), 165 deletions(-)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
> b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> index 6a57426..839d4ec 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h

[...]

>  /**
> @@ -150,13 +150,16 @@ rte_mov64blocks(uint8_t *dst, const uint8_t *src, 
> size_t n)
>   __m256i ymm0, ymm1;
>  
>   while (n >= 64) {
> - ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
> *)src + 0 * 32));
> +
> + ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32));
> + ymm1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32));
> +
> + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), ymm0);
> + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), ymm1);
> +

Any particular reason to change the order of the statements here? :)
Overall this patch looks good.

>   n -= 64;
> - ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t 
> *)src + 1 * 32));
> - src = (const uint8_t *)src + 64;
> - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
> - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
> - dst = (uint8_t *)dst + 64;
> + src = src + 64;
> + dst = dst + 64;
>   }
>  }
>  



[dpdk-dev] [PATCH v6 08/10] virtio_pci: do not parse if interface is vfio

2016-01-27 Thread Santosh Shukla
If the virtio interface is attached to the vfio-noiommu driver, then
do not parse the virtio resources. Instead, return 0.

Note: Applicable for virtio spec 0.95.

Signed-off-by: Santosh Shukla 
---
v5-->v6:
- Replaced pci_dev->kdrv check from __noiommu to default;
  This is because patch [1] in v5 series not required.
  [1] http://dpdk.org/dev/patchwork/patch/9984/

 drivers/net/virtio/virtio_pci.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio/virtio_pci.c b/drivers/net/virtio/virtio_pci.c
index 0c29f1d..deec306 100644
--- a/drivers/net/virtio/virtio_pci.c
+++ b/drivers/net/virtio/virtio_pci.c
@@ -514,7 +514,9 @@ virtio_resource_init_by_ioports(struct rte_pci_device 
*pci_dev)
 static int
 legacy_virtio_resource_init(struct rte_pci_device *pci_dev)
 {
-   if (virtio_resource_init_by_uio(pci_dev) == 0)
+   if (pci_dev->kdrv == RTE_KDRV_VFIO)
+   return 0;
+   else if (virtio_resource_init_by_uio(pci_dev) == 0)
return 0;
else
return virtio_resource_init_by_ioports(pci_dev);
-- 
1.7.9.5



[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMUi driver mode

2016-01-27 Thread Santosh Shukla
On Wed, Jan 27, 2016 at 9:26 PM, Santosh Shukla  wrote:
> On Wed, Jan 27, 2016 at 9:09 PM, Thomas Monjalon
>  wrote:
>> 2016-01-27 21:02, Santosh Shukla:
>>> 1. virtio currently works for vfio+noiommu and likely will work for
>>> vfio+iommu in near future.
>>> 2. So remove __noiommu suffix and always use default.
>>> 3. Introduce vfio resource parsing global function, That function
>>> suppose to do parsing for default vfio case and for vfio-noiommu case.
>>> This function will be used by pmd drivers for resource parsing purpose
>>> example virtio.
>>>
>>> Yuan won't be happy with 3) I guess, because he wanted to get rid of
>>> interface parsing from pmd driver.
>>>
>>> Thomas, if 1/2/3/ addresses your concern then I'll spin the series,
>>
>> I agree with 1/ and 2/.
>> Please, could you explain why 3/ is needed?
>
> Because someone should do resource parsing / validation before driver
> does resource mapping/initialization. That someone could be either EAL
> layer or driver itself.
>
> In my case;
> - driver is virtio
> - resource is vfio interface

FWIW, removed 3) / Removed this patch entirely from this series,
Sending v6 version for the affected patch [09/11]..


[dpdk-dev] [dpdk-dev, v3] Implement memcmp using Intel SIMD instrinsics.

2016-01-27 Thread Zhihong Wang
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h b/lib
> /librte_eal/common/include/arch/x86/rte_memcmp.h

[...]

> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Compare bytes between two locations. The locations must not overlap.
> + *

Parameter names should be kept consistent as they are in function body.

> + * @param src_1
> + *   Pointer to the first source of the data.
> + * @param src_2
> + *   Pointer to the second source of the data.
> + * @param n
> + *   Number of bytes to compare.
> + * @return
> + *   zero if src_1 equal src_2
> + *   -ve if src_1 less than src_2
> + *   +ve if src_1 greater than src_2
> + */
> +static inline int
> +rte_memcmp(const void *src_1, const void *src,
> + size_t n) __attribute__((always_inline));
> +
> +/**
> + * Find the first different bit for comparison.
> + */
> +static inline int
> +rte_cmpffd (uint32_t x, uint32_t y)
> +{
> + int i;
> + int pos = x ^ y;
> + for (i = 0; i < 32; i++)
> + if (pos & (1<<i))
> + return i;
> + return -1;
> +}
> +

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Guess this is not used.

[...]

> +/**
> + * Compare 256 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp256(const void *src_1, const void *src_2)
> +{
> + int ret;
> +
> + ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
> + (const uint8_t *)src_2 + 0 * 64);

Why not just use rte_cmp128?


[...]

> +static inline int
> +rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
> +{
> + const uint8_t *src_1 = (const uint8_t *)_src_1;
> + const uint8_t *src_2 = (const uint8_t *)_src_2;
> + int ret = 0;
> +
> + if (n < 16)
> + return rte_memcmp_regular(src_1, src_2, n);
> +
> + if (n <= 32) {
> + ret = rte_cmp16(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +

Too many conditions here may harm the overall performance.
It's a trade-off thing, all about balancing the overhead.
Just make sure this is tuned based on actual test numbers.


> + if (n <= 48) {
> + ret = rte_cmp32(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +
> + if (n <= 64) {
> + ret = rte_cmp32(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16(src_1 + 32, src_2 + 32);
> +
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +
> + if (n <= 96) {
> + ret = rte_cmp64(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16(src_1 + 64, src_2 + 64);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +
> + if (n <= 128) {
> + ret = rte_cmp64(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp32(src_1 + 64, src_2 + 64);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16(src_1 + 96, src_2 + 96);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Not used.

> +{
> + int ret;
> +
> + ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> + (const uint8_t *)src_2 + 0 * 16);
> +
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
> + (const uint8_t *)src_2 + 1 * 16);
> +
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16((const uint8_t *)src_1 + 2 * 16,
> + (const uint8_t *)src_2 + 2 * 16);
> +}
> +
> +/**
> + * Compare 64 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp64(const void *src_1, const void *src_2)
> +{
> + int ret;
> +
> + ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> + (const uint8_t *)src_2 + 0 * 16);

Why not rte_cmp32? And use rte_cmp64 for rte_cmp128, and so 

[dpdk-dev] [PATCH v3] remove extra parentheses in return statement

2016-01-27 Thread Huawei Xie
v3 changes:
 remove other extra parentheses in 'return (logical expressions)'
which checkpatch doesn't report as error
 remove extra parentheses in return statement which crosses
multiple line
 fix the document

v2 changes:
 add missed commit message in v1

fix the error reported by checkpatch:
  "ERROR: return is not a function, parentheses are not required"

remove parentheses in return like:
  "return (logical expressions)"

remove parentheses in return a function like:
  "return (rte_mempool_lookup(...))"

Fixes: 6307b909b8e0 ("lib: remove extra parenthesis after return")

Signed-off-by: Huawei Xie 
---
 app/test-pmd/cmdline.c | 12 ++--
 app/test-pmd/config.c  |  2 +-
 app/test-pmd/flowgen.c |  2 +-
 app/test-pmd/mempool_anon.c| 12 ++--
 app/test-pmd/testpmd.h |  2 +-
 app/test-pmd/txonly.c  |  2 +-
 app/test/test_kni.c|  2 +-
 app/test/test_mbuf.c   | 12 ++--
 app/test/test_memcpy_perf.c|  4 +-
 app/test/test_mempool.c|  4 +-
 app/test/test_memzone.c| 24 +++
 app/test/test_red.c| 42 ++--
 app/test/test_ring.c   |  4 +-
 doc/guides/sample_app_ug/ipv4_multicast.rst|  8 +--
 drivers/crypto/aesni_mb/rte_aesni_mb_pmd_ops.c |  2 +-
 drivers/crypto/qat/qat_crypto.c|  4 +-
 drivers/crypto/qat/qat_qp.c| 22 +++---
 drivers/net/bnx2x/bnx2x.c  | 34 -
 drivers/net/bnx2x/bnx2x.h  |  4 +-
 drivers/net/bnx2x/bnx2x_rxtx.c | 16 ++---
 drivers/net/bnx2x/debug.c  |  6 +-
 drivers/net/bnx2x/elink.c  |  2 +-
 drivers/net/bonding/rte_eth_bond_pmd.c |  2 +-
 drivers/net/cxgbe/cxgbe_main.c |  2 +-
 drivers/net/e1000/em_ethdev.c  | 40 +--
 drivers/net/e1000/em_rxtx.c| 46 ++---
 drivers/net/e1000/igb_ethdev.c | 22 +++---
 drivers/net/e1000/igb_rxtx.c   | 30 
 drivers/net/enic/enic_clsf.c   |  2 +-
 drivers/net/fm10k/fm10k_ethdev.c   | 40 +--
 drivers/net/i40e/i40e_ethdev.c |  2 +-
 drivers/net/i40e/i40e_ethdev.h |  2 +-
 drivers/net/i40e/i40e_ethdev_vf.c  |  2 +-
 drivers/net/i40e/i40e_rxtx.c   | 14 ++--
 drivers/net/ixgbe/ixgbe_82599_bypass.c |  4 +-
 drivers/net/ixgbe/ixgbe_bypass.c   |  2 +-
 drivers/net/ixgbe/ixgbe_ethdev.c   | 34 -
 drivers/net/ixgbe/ixgbe_rxtx.c | 36 +-
 drivers/net/mlx5/mlx5_rxq.c|  2 +-
 drivers/net/mlx5/mlx5_utils.h  |  2 +-
 drivers/net/mpipe/mpipe_tilegx.c   |  4 +-
 drivers/net/nfp/nfp_net.c  | 16 ++---
 drivers/net/virtio/virtio_ethdev.c |  6 +-
 drivers/net/vmxnet3/vmxnet3_ring.h |  2 +-
 drivers/net/xenvirt/virtqueue.h|  2 +-
 examples/ip_pipeline/cpu_core_map.c|  2 +-
 .../pipeline/pipeline_flow_actions_be.c|  2 +-
 examples/ip_reassembly/main.c  | 22 +++---
 examples/ipv4_multicast/main.c | 14 ++--
 examples/l3fwd/main.c  |  4 +-
 .../client_server_mp/mp_server/init.c  |  2 +-
 examples/multi_process/symmetric_mp/main.c |  2 +-
 examples/netmap_compat/bridge/bridge.c |  8 +--
 examples/netmap_compat/lib/compat_netmap.c | 80 +++---
 examples/performance-thread/common/lthread_queue.h |  2 +-
 examples/performance-thread/common/lthread_sched.c |  4 +-
 examples/qos_sched/args.c  |  2 +-
 examples/quota_watermark/qw/main.h |  2 +-
 examples/vhost/main.c  |  4 +-
 examples/vhost_xen/main.c  |  4 +-
 examples/vhost_xen/vhost_monitor.c |  6 +-
 lib/librte_acl/acl_bld.c   |  4 +-
 lib/librte_acl/acl_run_neon.h  |  2 +-
 lib/librte_cfgfile/rte_cfgfile.c   |  4 +-
 lib/librte_cryptodev/rte_cryptodev.c   | 24 +++
 lib/librte_eal/bsdapp/eal/eal_lcore.c  |  2 +-
 lib/librte_eal/common/eal_common_memzone.c |  2 +-
 .../common/include/arch/ppc_64/rte_atomic.h| 12 ++--
 .../common/include/arch/ppc_64/rte_byteorder.h | 10 +--
 .../common/include/arch/ppc_64/rte_spinlock.h  |  2 +-
 .../common/include/arch/x86/rte_atomic.h   | 

[dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# Inter-VM communication & IP allocation through DHCP issue

2016-01-27 Thread Abhijeet Karve
Hi Przemek,

Thanks for the quick response. We are now able to get DHCP IPs for 2
vhostuser instances, and they are able to ping each other. The issue was a
bug in the cirros 0.3.0 images which we were using in OpenStack; after
switching to the 0.3.1 image as given in the URL (
https://www.redhat.com/archives/rhos-list/2013-August/msg00032.html), we are
able to get the IPs in the vhostuser VM instances.

As per our understanding, Packet flow across DPDK datapath will be like 
vhostuser ports are connected to the br-int bridge & same is being patched 
to the br-dpdk bridge where in our physical network (NIC) is connected 
with dpdk0 port. 

So for testing the flow we have to connect that physical network(NIC) with 
external packet generator (e.g - ixia, iperf) & run the testpmd 
application in the vhostuser VM, right?

Is it required to add any flows/effort in the bridge configurations (either
br-int or br-dpdk)?


Thanks & Regards
Abhijeet Karve




From:   "Czesnowicz, Przemyslaw" 
To: Abhijeet Karve 
Cc: "dev at dpdk.org" , "discuss at openvswitch.org" 
, "Gray, Mark D" 
Date:   01/27/2016 05:11 PM
Subject:RE: [dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# 
Inter-VM communication & IP allocation through DHCP issue



Hi Abhijeet,


It seems you are almost there! 
When booting the VMs do you request hugepage memory for them (by setting 
hw:mem_page_size=large in flavor extra_spec)?
If not then please do, if yes then please look into libvirt logfiles for 
the VMs (in /var/log/libvirt/qemu/instance-xxx), I think there could be a 
clue.


Regards
Przemek

From: Abhijeet Karve [mailto:abhijeet.ka...@tcs.com] 
Sent: Monday, January 25, 2016 6:13 PM
To: Czesnowicz, Przemyslaw
Cc: dev at dpdk.org; discuss at openvswitch.org; Gray, Mark D
Subject: RE: [dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# 
Inter-VM communication & IP allocation through DHCP issue

Hi Przemek, 

Thank you for your response, It really provided us breakthrough. 

After setting up DPDK on compute node for stable/kilo, We are trying to 
set up Openstack stable/liberty all-in-one setup, At present we are not 
able to get the IP allocation for the vhost type instances through DHCP. 
Also we tried assigning IP's manually to them but the inter-VM 
communication also not happening, 

#neutron agent-list 
root at nfv-dpdk-devstack:/etc/neutron# neutron agent-list 
+--++---+---++---+
 

| id   | agent_type | host  | 
alive | admin_state_up | binary| 
+--++---+---++---+
 

| 3b29e93c-3a25-4f7d-bf6c-6bb309db5ec0 | DPDK OVS Agent | 
nfv-dpdk-devstack | :-)   | True   | neutron-openvswitch-agent | 
| 62593b2c-c10f-4d93-8551-c46ce24895a6 | L3 agent   | 
nfv-dpdk-devstack | :-)   | True   | neutron-l3-agent  | 
| 7cb97af9-cc20-41f8-90fb-aba97d39dfbd | DHCP agent | 
nfv-dpdk-devstack | :-)   | True   | neutron-dhcp-agent| 
| b613c654-99b7-437e-9317-20fa651a1310 | Linux bridge agent | 
nfv-dpdk-devstack | :-)   | True   | neutron-linuxbridge-agent | 
| c2dd0384-6517-4b44-9c25-0d2825d23f57 | Metadata agent | 
nfv-dpdk-devstack | :-)   | True   | neutron-metadata-agent| 
| f23dde40-7dc0-4f20-8b3e-eb90ddb15e49 | Open vSwitch agent | 
nfv-dpdk-devstack | xxx   | True   | neutron-openvswitch-agent | 
+--++---+---++---+
 



ovs-vsctl show output# 
 
Bridge br-dpdk 
Port br-dpdk 
Interface br-dpdk 
type: internal 
Port phy-br-dpdk 
Interface phy-br-dpdk 
type: patch 
options: {peer=int-br-dpdk} 
Bridge br-int 
fail_mode: secure 
Port "vhufa41e799-f2" 
tag: 5 
Interface "vhufa41e799-f2" 
type: dpdkvhostuser 
Port int-br-dpdk 
Interface int-br-dpdk 
type: patch 
options: {peer=phy-br-dpdk} 
Port "tap4e19f8e1-59" 
tag: 5 
Interface "tap4e19f8e1-59" 
type: internal 
Port "vhu05734c49-3b" 
tag: 5 
Interface "vhu05734c49-3b" 
type: dpdkvhostuser 
Port "vhu10c06b4d-84" 
tag: 5 
Interface "vhu10c06b4d-84" 
type: dpdkvhostuser 
Port patch-tun 
Interface patch-tun 
type: patch 
options: {peer=patch-int} 
Port "vhue169c581-ef" 
tag: 5 
Interface "vhue169c581-ef" 
 

[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMUi driver mode

2016-01-27 Thread Santosh Shukla
On Wed, Jan 27, 2016 at 9:09 PM, Thomas Monjalon
 wrote:
> 2016-01-27 21:02, Santosh Shukla:
>> 1. virtio currently works for vfio+noiommu and likely will work for
>> vfio+iommu in near future.
>> 2. So remove __noiommu suffix and always use default.
>> 3. Introduce vfio resource parsing global function, That function
>> suppose to do parsing for default vfio case and for vfio-noiommu case.
>> This function will be used by pmd drivers for resource parsing purpose
>> example virtio.
>>
>> Yuan won't be happy with 3) I guess, because he wanted to get rid of
>> interface parsing from pmd driver.
>>
>> Thomas, if 1/2/3/ addresses your concern then I'll spin the series,
>
> I agree with 1/ and 2/.
> Please, could you explain why 3/ is needed?

Because someone should do resource parsing / validation before driver
does resource mapping/initialization. That someone could be either EAL
layer or driver itself.

In my case;
- driver is virtio
- resource is vfio interface


[dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms

2016-01-27 Thread Thomas Monjalon
2016-01-27 18:48, Ananyev, Konstantin:
> From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
> > 
> > > Zhihong Wang (5):
> > >   lib/librte_eal: Identify AVX512 CPU flag
> > >   mk: Predefine AVX512 macro for compiler
> > >   lib/librte_eal: Optimize memcpy for AVX512 platforms
> > >   app/test: Adjust alignment unit for memcpy perf test
> > >   lib/librte_eal: Tune memcpy for prior platforms
> > >
> > >  app/test/test_memcpy_perf.c|   6 +
> > >  .../common/include/arch/x86/rte_cpuflags.h |   2 +
> > >  .../common/include/arch/x86/rte_memcpy.h   | 269 
> > > -
> > >  mk/rte.cpuflags.mk |   4 +
> > >  4 files changed, 268 insertions(+), 13 deletions(-)
> > 
> > The maintainers of arch/x86 are Bruce and Konstantin.
> > I guess there is no comment and we can apply this cool series?
> 
> Yes, looks ok to me.

Applied, thanks

Some benchmark feedback would be welcome.


[dpdk-dev] [PATCH v2] doc: introduce networking driver matrix

2016-01-27 Thread Thomas Monjalon
In order to better compare the drivers and check what is missing
for a common baseline, we need to fill a matrix.

A CSS trick is used to fit the HTML page.
The PDF output needs some LaTeX wizardry.

Signed-off-by: Thomas Monjalon 
---
v2: add vector PMDs
---
 doc/guides/nics/index.rst|   1 +
 doc/guides/nics/overview.rst | 147 +++
 2 files changed, 148 insertions(+)
 create mode 100644 doc/guides/nics/overview.rst

diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst
index 33c9cea..8618114 100644
--- a/doc/guides/nics/index.rst
+++ b/doc/guides/nics/index.rst
@@ -35,6 +35,7 @@ Network Interface Controller Drivers
 :maxdepth: 3
 :numbered:

+overview
 bnx2x
 cxgbe
 e1000em
diff --git a/doc/guides/nics/overview.rst b/doc/guides/nics/overview.rst
new file mode 100644
index 000..d4c6ff4
--- /dev/null
+++ b/doc/guides/nics/overview.rst
@@ -0,0 +1,147 @@
+..  BSD LICENSE
+Copyright 2016 6WIND S.A.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+* Neither the name of 6WIND S.A. nor the names of its
+contributors may be used to endorse or promote products derived
+from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Overview of Networking Drivers
+==============================
+
+The networking drivers may be classified in two categories:
+
+- physical for real devices
+- virtual for emulated devices
+
+Some physical devices may be shaped through a virtual layer as for
+SR-IOV.
+The interface seen in the virtual environment is a VF (Virtual Function).
+
+The ethdev layer exposes an API to use the networking functions
+of these devices.
+The bottom half part of ethdev is implemented by the drivers.
+Thus some features may not be implemented.
+
+There are more differences between drivers regarding some internal properties,
+portability or even documentation availability.
+Most of these differences are summarized below.
+
+.. _table_net_pmd_features:
+
+.. raw:: html
+
+   <style>
+  table#id1 th {
+ font-size: 80%;
+ white-space: pre-wrap;
+ text-align: center;
+ vertical-align: top;
+ padding: 3px;
+  }
+  table#id1 th:first-child {
+ vertical-align: bottom;
+  }
+  table#id1 td {
+ font-size: 70%;
+ padding: 1px;
+  }
+  table#id1 td:first-child {
+ padding-left: 1em;
+  }
+   </style>
+
+.. table:: Features availability in networking drivers
+
+    = = = = = = = = = = = = = = = = = = = = = = = = = = = 
= = = =
+   Feature  a b b b c e e i i i i i i i i i i f f m m m n n p r s 
v v v x
+f n n o x 1 n 4 4 4 4 g g x x x x m m l l p f u c i z 
i i m e
+p x x n g 0 i 0 0 0 0 b b g g g g 1 1 x x i p l a n e 
r r x n
+a 2 2 d b 0 c e e e e   v b b b b 0 0 4 5 p   l p g d 
t t n v
+c x x i e 0 . v v   f e e e e k k e a 
i i e i
+k   v n . f f   . v v   .   t 
o o t r
+e   f g .   .   . f f   .   a  
 . 3 t
+t   v   v   v   v   v   2  
 v
+e   e   e   e   e  
 e
+c   c   c   c   c  
 c
+    = = = = = = = = = = = = = = = = = = = = = = = = = = = 
= = = =
+   link status
+   link status event
+   Rx interrupt
+   queue start/stop
+   MTU update
+   jumbo frame
+   scattered Rx
+   LRO
+   TSO
+   promiscuous mode
+   allmulticast mode

[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMUi driver mode

2016-01-27 Thread Santosh Shukla
On Wed, Jan 27, 2016 at 4:11 PM, Santosh Shukla  wrote:
> On Tue, Jan 26, 2016 at 9:51 PM, Santosh Shukla  wrote:
>> On Tue, Jan 26, 2016 at 7:58 PM, Thomas Monjalon
>>  wrote:
>>> 2016-01-26 19:35, Santosh Shukla:
 On Tue, Jan 26, 2016 at 6:30 PM, Thomas Monjalon
  wrote:
 > 2016-01-26 15:56, Santosh Shukla:
 >> In my observation, currently virtio work for vfio-noiommu, that's why
 >> said drv->kdrv need to know vfio mode.
 >
 > It is your observation. It may change in near future.

 so that mean till then, virtio support for non-x86 arch has to wait?
>>>
>>> No, absolutely not. virtio for non-x86 is welcome.
>>>
 We have working model with vfio-noiommu, don't you think it make sense
 to let vfio_noiommu implementation exist and later in-case
 virtio+iommu gets mainline then switch to vfio __mode__ agnostic
 approach. And for that All it takes to replace __noiommu suffix with
 default.
>>>
>>> I'm just saying you should not touch the enum rte_kernel_driver.
>>> RTE_KDRV_VFIO is a driver.
>>> RTE_KDRV_VFIO_NOIOMMU is a mode.
>>> As the VFIO API is the same in both modes, there is no reason to
>>> distinguish them at this level.
>>> Your patch adds the NOIOMMU case everywhere:
>>> case RTE_KDRV_VFIO:
>>> +   case RTE_KDRV_VFIO_NOIOMMU:
>>>
>>> I'll stop commenting here to let others give their opinion.
>>>
>>> [...]
 >> with vfio+iommu; binding virtio pci device to vfio-pci driver fail;
 >> giving below error:
 >> [   53.053464] VFIO - User Level meta-driver version: 0.3
 >> [   73.077805] vfio-pci: probe of :00:03.0 failed with error -22
 >> [   73.077852] vfio-pci: probe of :00:03.0 failed with error -22
 >>
 >> vfio_pci_probe() --> vfio_iommu_group_get() --> iommu_group_get()
 >> fails: iommu doesn't have group for virtio pci device.
 >
 > Yes it fails when binding.
 > So the later check in the virtio PMD is useless.

 Which check?
>>>
>>> The check for VFIO noiommu only:
>>> -   if (dev->kdrv == RTE_KDRV_VFIO)
>>> +   if (dev->kdrv == RTE_KDRV_VFIO_NOIOMMU)
>>>
>>> [...]
 > Furthermore restricting virtio to no-iommu mode doesn't bring
 > any improvement.

 We're not __restricting__, as soon as virtio+iommu gets working state,
 we'll simply replace __noiommu with default. Then its upto user to try
 out virtio with vfio default or vfio_noiommu.
>>>
>>> Yes it's up to user.
>>> So your code should be
>>> if (dev->kdrv == RTE_KDRV_VFIO)
>>>
>>
>> Right,
>>
 > That's why I suggest to keep the initial semantic of kdrv and
 > not pollute it with VFIO modes.

 I am okay to live with default and forget suffix __noiommu but there
 are implementation problem which was discussed in other thread
 - Virtio pmd driver should avoid interface parsing i.e.
 virtio_resource_init_uio/vfio() etc.. For vfio case - We could easily
 get rid of by moving /sys parsing to pci_eal layer, Right? If so then
 virtio currently works with vfio-noiommu, it make sense to me that
 pci_eal layer does parsing for pmd driver before that pmd driver get
 initialized.
>>>
>>> Please reword. What is the problem?
>>>
 - Another case could be: iommu-less-pmd-driver. eal layer to do
 parsing before updating drv->kdrv.
>>>
>>> [...]
 >> >> > If a check is needed, I would prefer using your function
 >> >> > pci_vfio_is_noiommu() and remove driver modes from struct 
 >> >> > rte_kernel_driver.
 >> >>
 >> >> I don't think calling pci_vfio_no_iommu() inside
 >> >> virtio_reg_rd/wr_1/2/3() would be a good idea.
 >> >
 >> > Why? The value may be cached in the priv properties.
 >> >
 >> pci_vfio_is_noiommu() parses /sys for
 >> - enable_noiommu param
 >> - attached driver name is vfio-noiommu or not.
 >>
 >> It does file operation for that, I meant to say that calling this api
 >> within register_rd/wr function is not correct. It would be better if
 >> those low level register_rd/wr api only checks driver_types.
 >
 > Yes, that's why I said the return of pci_vfio_is_noiommu() may be cached
 > to keep efficiency.

 I am not convinced though, Still find pmd driver checking driver_types
 using drv->kdrv is better approach than introducing a new global
 variable which may look something like;
>>>
>>> Not a global variable. A function in EAL layer. A variable in PMD priv.
>>>
>>
>> If we agreed to use condition (drv->kdrv == RTE_KDRV_VFIO);
>> then resource parsing for vfio {including vfio and vfio_noiommu both
>> case} is enforced in virtio pmd driver layer and that is contradicting
>> to what we agreed earlier in this[1] thread. Also we don't need a
>> function in EAL layer or a variable in PMD priv. Perhaps a private
>> function in virtio pmd which does parsing for vfio interface.
>>
>> Thoughts?
>>
>> [1] http://dpdk.org/dev/patchwork/patch/9862/
>>

[dpdk-dev] [PATCH] tools: fix syntax errors and add support for Python 3

2016-01-27 Thread yurai
This patch fixes syntax errors from tools/setup.sh when binding an ethernet
device on systems where Python 3 is the default. Backward compatibility with
Python 2 is preserved.

Signed-off-by: Dawid Jurczak 
---
 tools/dpdk_nic_bind.py | 74 +-
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/tools/dpdk_nic_bind.py b/tools/dpdk_nic_bind.py
index f02454e..9f7c848 100755
--- a/tools/dpdk_nic_bind.py
+++ b/tools/dpdk_nic_bind.py
@@ -54,7 +54,7 @@ args = []
 def usage():
 '''Print usage information for the program'''
 argv0 = basename(sys.argv[0])
-print """
+print ("""
 Usage:
 --

@@ -110,7 +110,7 @@ To unbind :01:00.0 from using any driver
 To bind :02:00.0 and :02:00.1 to the ixgbe kernel driver
 %(argv0)s -b ixgbe 02:00.0 02:00.1

-""" % locals() # replace items from local variables
+""" % locals()) # replace items from local variables

 # This is roughly compatible with check_output function in subprocess module
 # which is only available in python 2.7.
@@ -156,7 +156,7 @@ def check_modules():
 '''Checks that igb_uio is loaded'''
 global dpdk_drivers

-fd = file("/proc/modules")
+fd = open("/proc/modules", 'r')
 loaded_mods = fd.readlines()
 fd.close()

@@ -176,10 +176,10 @@ def check_modules():
 # check if we have at least one loaded module
 if True not in [mod["Found"] for mod in mods] and b_flag is not None:
 if b_flag in dpdk_drivers:
-print "Error - no supported modules(DPDK driver) are loaded"
+print ("Error - no supported modules(DPDK driver) are loaded")
 sys.exit(1)
 else:
-print "Warning - no supported modules(DPDK driver) are loaded"
+print ("Warning - no supported modules(DPDK driver) are loaded")

 # change DPDK driver list to only contain drivers that are loaded
 dpdk_drivers = [mod["Name"] for mod in mods if mod["Found"]]
@@ -198,7 +198,7 @@ def get_pci_device_details(dev_id):
 for line in extra_info:
 if len(line) == 0:
 continue
-name, value = line.split("\t", 1)
+name, value = line.decode().split("\t", 1)
 name = name.strip(":") + "_str"
 device[name] = value
 # check for a unix interface name
@@ -234,7 +234,7 @@ def get_nic_details():
 dev["Device"] = int(dev["Device"],16)
 devices[dev["Slot"]] = dict(dev) # use dict to make copy of dev
 else:
-name, value = dev_line.split("\t", 1)
+name, value = dev_line.decode().split("\t", 1)
 dev[name.rstrip(":")] = value

 # check what is the interface if any for an ssh connection if
@@ -243,17 +243,17 @@ def get_nic_details():
 route = check_output(["ip", "-o", "route"])
 # filter out all lines for 169.254 routes
 route = "\n".join(filter(lambda ln: not ln.startswith("169.254"),
- route.splitlines()))
+ route.decode().splitlines()))
 rt_info = route.split()
-for i in xrange(len(rt_info) - 1):
+for i in range(len(rt_info) - 1):
 if rt_info[i] == "dev":
 ssh_if.append(rt_info[i+1])

 # based on the basic info, get extended text details
 for d in devices.keys():
 # get additional info and add it to existing data
-devices[d] = dict(devices[d].items() +
-  get_pci_device_details(d).items())
+devices[d] = devices[d].copy()
+devices[d].update(get_pci_device_details(d).items())

 for _if in ssh_if:
 if _if in devices[d]["Interface"].split(","):
@@ -293,22 +293,22 @@ def dev_id_from_dev_name(dev_name):
 if dev_name in devices[d]["Interface"].split(","):
 return devices[d]["Slot"]
 # if nothing else matches - error
-print "Unknown device: %s. " \
-"Please specify device in \"bus:slot.func\" format" % dev_name
+print ("Unknown device: %s. " \
+"Please specify device in \"bus:slot.func\" format" % dev_name)
 sys.exit(1)

 def unbind_one(dev_id, force):
 '''Unbind the device identified by "dev_id" from its current driver'''
 dev = devices[dev_id]
 if not has_driver(dev_id):
-print "%s %s %s is not currently managed by any driver\n" % \
-(dev["Slot"], dev["Device_str"], dev["Interface"])
+print ("%s %s %s is not currently managed by any driver\n" % \
+(dev["Slot"], dev["Device_str"], dev["Interface"]))
 return

 # prevent us disconnecting ourselves
 if dev["Ssh_if"] and not force:
-print "Routing table indicates that interface %s is active" \
-". Skipping unbind" % (dev_id)
+print ("Routing table indicates that interface %s is active" \
+". Skipping unbind" % (dev_id))
 return

 # write to /sys to unbind
@@ -316,7 +316,7 @@ def unbind_one(dev_id, 

[dpdk-dev] DPDK mbuf pool in SR-IOV env and one RX/TX queue

2016-01-27 Thread Saurabh Mishra
Any clues or hint on how to debug this kind of problem on SR-IOV? Only
primary can send packet but secondary process couldn't. I have verified
host's qprc and qptc counters on PF and they do increment.

SR-IOV with DPDK seems more challenging than PCI pass through of whole NIC.

Saurabh
On Jan 26, 2016 12:19 PM, "Bruce Richardson" 
wrote:

> On Mon, Jan 25, 2016 at 04:15:28PM -0800, Saurabh Mishra wrote:
> > Hi Bruce --
> >
> > >The sharing of the mbuf pool is not an issue, but sharing of rx/tx
> queues
> > is.
> > >The ethdev queues are not multi-thread safe, so to share a queue between
> > processes
> > >or threads, you need to put in locks or other access control mechanisms.
> > [This
> > >also implies a performance hit due to the locking]
> > >Regards,
> > >/Bruce
> >
> > Right. So now we have only one process to do rx/tx on queue 0 if we
> detect
> > that max queue support is 1.
> >
> > However, we have noticed that if our process, which does rx/tx, is not
> > primary, then we can't transmit the packet out with SR-IOV.
> >
> > Is there any specific limitation on SR-IOV (the vf driver in dpdk) that
> > only primary process should receive and transmit packets?
> >
> > In our model, we have an agent process which monitor links and another
> > process which does packet processing. If we make our agent process as
> > primary then our secondary process is not able to send the packets --
> > rte_eth_tx_burst() succeed but recipient does not receive the packet.
> >
> > Thanks,
> > /Saurabh
>
> There should be no restrictions on RX/TX from secondary processes.
>
> /Bruce
>


[dpdk-dev] [RFC] eal: add cgroup-aware resource self discovery

2016-01-27 Thread Tan, Jianfeng
Hi Neil,

On 1/26/2016 10:19 PM, Neil Horman wrote:
> On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
>> Hi Neil,
>>
>> On 1/25/2016 9:46 PM, Neil Horman wrote:
>>> On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
>> ...
 -- 
 2.1.4


>>> This doesn't make a whole lot of sense, for several reasons:
>>>
>>> 1) Applications, as a general rule shouldn't be interrogating the cgroups
>>> interface at all.
>> The main reason to do this in DPDK is that DPDK obtains resource information
>> from sysfs and proc, which are not well containerized so far. And DPDK
>> pre-allocates resource instead of on-demand gradual allocating.
>>
> Not disagreeing with this, just suggesting that:
>
> 1) Interrogating cgroups really isn't the best way to collect that information
> 2) Pre-allocating those resources isn't particularly wise without some 
> mechanism
> to reallocate it, as resource constraints can change (consider your cpuset
> getting rewritten)

In the case of reallocate,
For cpuset, DPDK panics in the initialization if set_affinity fails, but 
after that, cpuset rewritten will not bring any problem I believe.
For memory, a running application uses 2G hugepages, then admin 
decreases hugetlb cgroup into 1G, the application will not get killed, 
unless it tries to access more hugepages (I'll double check this).

So another way to address this problem is to add an option that DPDK 
tries best to allocate those resources, and if fails, it just posts a 
warning and uses those allocated resources, instead of panic. What do 
you think?

>
>>> 2) Cgroups aren't the only way in which a cpuset or memoryset can be 
>>> restricted
>>> (the isolcpus command line argument, or a taskset on a parent process for
>>> instance, but there are several others).
>> Yes, I agree. To enable that, I'd like to design the new API for resource self
>> discovery in a flexible way. A parameter "type" is used to specify the
>> discovery method. In addition, I'm considering adding a callback
>> function pointer so that users can write their own resource discovery
>> functions.
>>
> Why?  You don't need an API for this, or if you really want one, it can be 
> very
> generic if you use POSIX apis to gather the information.  What you have here 
> is
> going to be very linux specific, and will need reimplementing for BSD or other
> operating systems.  To use the cpuset example, instead of reading and parsing
> the mask files in the cgroup filesystem module to find your task and
> corresponding mask, just call sched_setaffinity with an all f's mask, then 
> call
> sched_getaffinity.  The returned mask will be all the cpus your process is
> allowed to execute on, taking into account every limiting filter the system 
> you
> are running on offers.

Yes, it makes sense on cpu's side.

>
> There are similar OS level POSIX apis for most resources out there.  You
> really
> don't need to dig through cgroups just to learn what some of those resources
> are.
>
>>> Instead of trying to figure out what cpuset is valid for your process by
>>> interrogating the cgroups hierarchy, instead you should follow the
>>> prescribed
>>> method of calling sched_getaffinity after calling sched_setaffinity.  That
>>> will
>>> give you the canonical cpuset that you are executing on, taking all cpuset
>>> filters into account (including cgroups and any other restrictions).  It's
>>> far
>>> simpler as well, as it doesn't require a ton of file/string processing.
>> Yes, this way is much better for cpuset discovery. But is there such a
>> syscall for hugepages?
>>
> In what capacity?  Interrogating how many hugepages you have, or to what node
> they are affined to?  Capacity would require reading the requisite proc file, 
> as
> theres no posix api for this resource.  Node affinity can be implied by 
> setting
> the numa policy of the dpdk and then writing to /proc/nr_hugepages, as the
> kernel will attempt to distribute hugepages evenly among the tasks' numa 
> policy
> configuration.

For memory affinity, I believe the existing way of reading 
/proc/self/pagemap already handle the problem. What I was asking is how 
much memory (or hugepages in Linux's case) can be used. By the way, what 
is /proc/nr_hugepages?

>
> That said, I would advise that you strongly consider not exporting hugepages 
> as
> a resource, as:
>
> a) Applications generally don't need to know that they are using hugepages, 
> and
> so they dont need to know where said hugepages live, they just allocate memory
> via your allocation api and you give them something appropriate

But the allocation api provider, DPDK library, needs to know if it's 
using hugepages or not.

> b) Hugepages are a resource that are very specific to Linux, and to X86 Linux 
> at
> that.  Some OS implement similar resources, but they may have very different
> semantics.  And other Arches may or may not implement various forms of 
> compound
> paging at all.  As the DPDK expands to 

[dpdk-dev] [PATCH v2] fix checkpatch errors

2016-01-27 Thread Thomas Monjalon
2016-01-27 01:26, Huawei Xie:
> v2 changes:
>  add missed commit message in v1
> 
> fix the error reported by checkpatch:
>  "ERROR: return is not a function, parentheses are not required"
> 
> also removed other extra parentheses like:
>  "return val == 0"
>  "return (rte_mempool_lookup(...))"

How are these examples different from the above checkpatch error?

Please add Fixes: 6307b909b8e0 ("lib: remove extra parenthesis after return")

This is the second run after above commit but I still see a lot of them.
Please check git grep 'return *('



[dpdk-dev] [PATCH] vfio/noiommu: Don't use iommu_present() to track fake groups

2016-01-27 Thread Santosh Shukla
On Wed, Jan 27, 2016 at 6:51 PM, Burakov, Anatoly
 wrote:
> Hi Alex,
>
>> On 01/23/2016 04:23 AM, Alex Williamson wrote:
>> > Using iommu_present() to determine whether an IOMMU group is real or
>> > fake has some problems.  First, apparently Power systems don't
>> > register an IOMMU on the device bus, so the groups and containers get
>> > marked as noiommu and then won't bind to their actual IOMMU driver.
>> > Second, I expect we'll run into the same issue as we try to support
>> > vGPUs through vfio, since they're likely to emulate this behavior of
>> > creating an IOMMU group on a virtual device and then providing a vfio
>> > IOMMU backend tailored to the sort of isolation they provide, which
>> > won't necessarily be fully compatible with the IOMMU API.
>> >
>> > The solution here is to use the existing iommudata interface to IOMMU
>> > groups, which allows us to easily identify the fake groups we've
>> > created for noiommu purposes.  The iommudata we set is purely
>> > arbitrary since we're only comparing the address, so we use the
>> > address of the noiommu switch itself.
>> >
>> > Reported-by: Alexey Kardashevskiy 
>> > Fixes: 03a76b60f8ba ("vfio: Include No-IOMMU mode")
>> > Signed-off-by: Alex Williamson 
>>
>>
>>
>> Reviewed-by: Alexey Kardashevskiy 
>> Tested-by: Alexey Kardashevskiy 
>
> Tested bringing the NIC's up, encountered no issues. Curious if it also works 
> for Santosh (CC'd) as he's one of the intended users of the No-IOMMU 
> functionality, but otherwise seems to work.
>

Yes, it works for the virtio dpdk case too. Tested-by:

Thanks.
> Thanks,
> Anatoly


[dpdk-dev] [PATCH] eal: add architecture specific rte_cpuflags.c files

2016-01-27 Thread Thomas Monjalon
2015-11-10 10:02, Ferruh Yigit:
> --- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> +++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> @@ -133,5 +133,6 @@ DPDK_2.2 {
>   global:
>  
>   rte_intr_cap_multiple;
> + cpu_feature_table;

As it is now an exported symbol, it should be prefixed with rte_.

Please take care when rebasing to
- create a new block of symbols for the new release version,
- and keep the new flag for AVX512 which is going to be applied.



[dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms

2016-01-27 Thread Ananyev, Konstantin
Hi Thomas,

> -Original Message-
> From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
> Sent: Wednesday, January 27, 2016 3:31 PM
> To: Richardson, Bruce; Ananyev, Konstantin
> Cc: dev at dpdk.org; Wang, Zhihong
> Subject: Re: [dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms
> 
> > Zhihong Wang (5):
> >   lib/librte_eal: Identify AVX512 CPU flag
> >   mk: Predefine AVX512 macro for compiler
> >   lib/librte_eal: Optimize memcpy for AVX512 platforms
> >   app/test: Adjust alignment unit for memcpy perf test
> >   lib/librte_eal: Tune memcpy for prior platforms
> >
> >  app/test/test_memcpy_perf.c|   6 +
> >  .../common/include/arch/x86/rte_cpuflags.h |   2 +
> >  .../common/include/arch/x86/rte_memcpy.h   | 269 
> > -
> >  mk/rte.cpuflags.mk |   4 +
> >  4 files changed, 268 insertions(+), 13 deletions(-)
> 
> The maintainers of arch/x86 are Bruce and Konstantin.
> I guess there is no comment and we can apply this cool series?

Yes, looks ok to me.
Konstantin


[dpdk-dev] DPDK bnx2x driver link problem

2016-01-27 Thread Saurabh Mishra
Looks like bnx2x has a link problem: sometimes it sees link up, but most of the
time it sees link down even though the RX/TX counters are going up.

Has anybody seen this type of problem? If I don't use DPDK then I don't see
this type of link related problem.

The counters show that it's receiving and transmitting packets.

I tried with -n1, -n2 and -n4.



*1st Time:*


[root@ ~]# ./symmetric_mp fakeelf -c 2 -m2048 -n4 --proc-type=primary -- -p
3 --num-procs=2 --proc-id=0


PMD: bnx2x_print_adapter_info(): Switch : 0


PMD: bnx2x_print_adapter_info(): ===



Checking link
status..done

Port 0 Link Up - speed 1 Mbps - full-duplex

Port 1 Link Down

APP: Finished Process Init.

Lcore 1 using ports 0 1

lcore 1 using queue 0 of each port

^C

Exiting on signal 2


Port 0: RX - 3, TX - 27, Drop - 0

Port 1: RX - 27, TX - 3, Drop - 0




*Second time:*


[root@ ~]# ./symmetric_mp fakeelf -c 2 -m2048 -n4 --proc-type=primary -- -p
3 --num-procs=2 --proc-id=0



PMD: bnx2x_print_adapter_info(): Switch : 0


PMD: bnx2x_print_adapter_info(): ===




Checking link
status..done

Port 0 Link Down

Port 1 Link Down

APP: Finished Process Init.

Lcore 1 using ports 0 1

lcore 1 using queue 0 of each port

^C

Exiting on signal 2


Port 0: RX - 3, TX - 17, Drop - 0

Port 1: RX - 17, TX - 3, Drop - 0

[root@ ~]#



*Third time:*



[root@ ~]# ./symmetric_mp fakeelf -c 2 -m2048 -n4 --proc-type=primary -- -p
3 --num-procs=2 --proc-id=0


Checking link
status..done

Port 0 Link Down

Port 1 Link Down

APP: Finished Process Init.

Lcore 1 using ports 0 1

lcore 1 using queue 0 of each port

^C

Exiting on signal 2


Port 0: RX - 8, TX - 84, Drop - 0

Port 1: RX - 84, TX - 8, Drop - 0


[root@~]#



*4th time:*


Checking link
status..done

Port 0 Link Down

Port 1 Link Down

APP: Finished Process Init.

Lcore 1 using ports 0 1

lcore 1 using queue 0 of each port

^C

Exiting on signal 2


Port 0: RX - 2, TX - 14, Drop - 0

Port 1: RX - 14, TX - 2, Drop - 0


[root@ ~]#


[dpdk-dev] [PATCH 4/4] examples/ip_pipeline: add packets dumping to PCAP file support

2016-01-27 Thread Fan Zhang
This patch adds a packet dumping feature to ip_pipeline. Output port type
SINK now supports dumping packets to PCAP file before releasing mbuf back
to mempool. This feature can be applied by specifying parameters in
configuration file as shown below:

[PIPELINE1]
type = PASS-THROUGH
core = 1
pktq_in = SOURCE0 SOURCE1
pktq_out = SINK0 SINK1
pcap_file_wr = /path/to/eth1.pcap /path/to/eth2.pcap
pcap_n_pkt_wr = 80 0

The configuration section "pcap_file_wr" contains the full path and name of
the PCAP file which the packets will be dumped to. If multiple SINKs
exist, each shall have its own PCAP file path listed in this section,
separated by spaces. Multiple SINK ports shall NOT share the same PCAP file
for dumping.

The configuration section "pcap_n_pkt_wr" contains integer value(s)
and indicates the maximum number of packets to be dumped to the PCAP file.
If this value is "0", the "infinite" dumping mode will be used. If this
value is N (N > 0), the dumping will be finished when the number of
packets dumped to the file reaches N.

To enable PCAP dumping support to IP pipeline, the compiler option
CONFIG_RTE_PORT_PCAP must be set to 'y'. It is possible to disable this
feature by removing "pcap_file_wr" and "pcap_n_pkt_wr" lines from the
configuration file.

Signed-off-by: Fan Zhang 
Acked-by: Cristian Dumitrescu 
---
 examples/ip_pipeline/app.h  |   2 +
 examples/ip_pipeline/config_parse.c | 159 
 examples/ip_pipeline/init.c |  11 +++
 examples/ip_pipeline/pipeline_be.h  |   2 +
 4 files changed, 174 insertions(+)

diff --git a/examples/ip_pipeline/app.h b/examples/ip_pipeline/app.h
index 9dbe668..144fab8 100644
--- a/examples/ip_pipeline/app.h
+++ b/examples/ip_pipeline/app.h
@@ -155,6 +155,8 @@ struct app_pktq_source_params {
 struct app_pktq_sink_params {
char *name;
uint8_t parsed;
+   char *file_name; /* Full path of PCAP file to be copied to mbufs */
+   uint32_t n_pkts_to_dump;
 };

 struct app_msgq_params {
diff --git a/examples/ip_pipeline/config_parse.c 
b/examples/ip_pipeline/config_parse.c
index f0bed81..9f5b974 100644
--- a/examples/ip_pipeline/config_parse.c
+++ b/examples/ip_pipeline/config_parse.c
@@ -184,6 +184,8 @@ struct app_pktq_source_params default_source_params = {

 struct app_pktq_sink_params default_sink_params = {
.parsed = 0,
+   .file_name = NULL,
+   .n_pkts_to_dump = 0,
 };

 struct app_msgq_params default_msgq_params = {
@@ -1003,6 +1005,83 @@ parse_pipeline_pcap_source(struct app_params *app,
 }

 static int
+parse_pipeline_pcap_sink(struct app_params *app,
+   struct app_pipeline_params *p,
+   const char *file_name, const char *n_pkts_to_dump)
+{
+   const char *next = NULL;
+   char *end;
+   uint32_t i;
+   int parse_file = 0;
+
+   if (file_name && !n_pkts_to_dump) {
+   next = file_name;
+   parse_file = 1; /* parse file path */
+   } else if (n_pkts_to_dump && !file_name) {
+   next = n_pkts_to_dump;
+   parse_file = 0; /* parse copy size */
+   } else
+   return -EINVAL;
+
+   char name[APP_PARAM_NAME_SIZE];
+   size_t name_len;
+
+   if (p->n_pktq_out == 0)
+   return -EINVAL;
+
+   for (i = 0; i < p->n_pktq_out; i++) {
+   if (p->pktq_out[i].type != APP_PKTQ_OUT_SINK)
+   return -EINVAL;
+   }
+
+   i = 0;
+   while (*next != '\0') {
+   uint32_t id;
+
+   if (i >= p->n_pktq_out)
+   return -EINVAL;
+
+   id = p->pktq_out[i].id;
+
+   end = strchr(next, ' ');
+   if (!end)
+   name_len = strlen(next);
+   else
+   name_len = end - next;
+
+   if (name_len == 0 || name_len == sizeof(name))
+   return -EINVAL;
+
+   strncpy(name, next, name_len);
+   name[name_len] = '\0';
+   next += name_len;
+   if (*next != '\0')
+   next++;
+
+   if (parse_file) {
+   app->sink_params[id].file_name = strdup(name);
+   if (app->sink_params[id].file_name == NULL)
+   return -ENOMEM;
+   } else {
+   if (parser_read_uint32(
+   &app->sink_params[id].n_pkts_to_dump,
+   name) != 0) {
+   if (app->sink_params[id].file_name != NULL)
+   free(app->sink_params[id].file_name);
+   return -EINVAL;
+   }
+   }
+
+   i++;
+
+   if (i == p->n_pktq_out)
+   return 0;
+   }
+
+   return -EINVAL;
+}
+
+static int
 parse_pipeline_pktq_in(struct app_params *app,
struct 

[dpdk-dev] [PATCH 3/4] lib/librte_port: add packet dumping to PCAP file support in sink port

2016-01-27 Thread Fan Zhang
Originally, sink ports in librte_port release received mbufs back to
mempool. This patch adds optional packet dumping to PCAP feature in sink
port: the packets will be dumped to user defined PCAP file for storage or
debugging. The user may also choose the sink port's activity: either it
continuously dump the packets to the file, or stops at certain dumping

This feature shares same CONFIG_RTE_PORT_PCAP compiler option as source
port PCAP file support feature. Users can enable or disable this feature
by setting CONFIG_RTE_PORT_PCAP compiler option "y" or "n".

Signed-off-by: Fan Zhang 
Acked-by: Cristian Dumitrescu 
---
 lib/librte_port/rte_port_source_sink.c | 268 +++--
 lib/librte_port/rte_port_source_sink.h |  11 +-
 2 files changed, 263 insertions(+), 16 deletions(-)

diff --git a/lib/librte_port/rte_port_source_sink.c 
b/lib/librte_port/rte_port_source_sink.c
index 44fc0d5..2878014 100644
--- a/lib/librte_port/rte_port_source_sink.c
+++ b/lib/librte_port/rte_port_source_sink.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 

 #ifdef RTE_PORT_PCAP
 #include 
@@ -345,12 +346,183 @@ rte_port_source_stats_read(void *port,

 struct rte_port_sink {
struct rte_port_out_stats stats;
+
+   /* PCAP dumper handle and pkts number */
+   void *dumper;
+   uint32_t max_pkts;
+   uint32_t pkt_index;
+   uint32_t dump_finish;
 };

+#ifdef RTE_PORT_PCAP
+
+/**
+ * Open PCAP file for dumping packets to the file later
+ *
+ * @param port
+ *   Handle to sink port
+ * @param p
+ *   Sink port parameter
+ * @return
+ *   0 on SUCCESS
+ *   error code otherwise
+ */
+static int
+pcap_sink_open(struct rte_port_sink *port,
+   __rte_unused struct rte_port_sink_params *p)
+{
+   pcap_t *tx_pcap;
+   pcap_dumper_t *pcap_dumper;
+
+   if (p->file_name == NULL) {
+   port->dumper = NULL;
+   port->max_pkts = 0;
+   port->pkt_index = 0;
+   port->dump_finish = 0;
+   return 0;
+   }
+
+   /** Open a dead pcap handler for opening dumper file */
+   tx_pcap = pcap_open_dead(DLT_EN10MB, 65535);
+   if (tx_pcap == NULL)
+   return -1;
+
+   /* The dumper is created using the previous pcap_t reference */
+   pcap_dumper = pcap_dump_open(tx_pcap, p->file_name);
+   if (pcap_dumper == NULL)
+   return -1;
+
+   port->dumper = pcap_dumper;
+   port->max_pkts = p->max_n_pkts;
+   port->pkt_index = 0;
+   port->dump_finish = 0;
+
+   return 0;
+}
+
+uint8_t jumbo_pkt_buf[ETHER_MAX_JUMBO_FRAME_LEN];
+
+/**
+ * Dump a packet to PCAP dumper
+ *
+ * @param p
+ *   Handle to sink port
+ * @param mbuf
+ *   Handle to mbuf structure holding the packet
+ */
+static void
+pcap_sink_dump_pkt(struct rte_port_sink *port, struct rte_mbuf *mbuf)
+{
+   uint8_t *pcap_dumper = (uint8_t *)(port->dumper);
+   struct pcap_pkthdr pcap_hdr;
+   uint8_t *pkt;
+
+   /* Maximum num packets already reached */
+   if (port->dump_finish)
+   return;
+
+   pkt = rte_pktmbuf_mtod(mbuf, uint8_t *);
+
+   pcap_hdr.len = mbuf->pkt_len;
+   pcap_hdr.caplen = pcap_hdr.len;
+   gettimeofday(&(pcap_hdr.ts), NULL);
+
+   if (mbuf->nb_segs > 1) {
+   struct rte_mbuf *jumbo_mbuf;
+   uint32_t pkt_index = 0;
+
+   /* if packet size longer than ETHER_MAX_JUMBO_FRAME_LEN,
+* ignore it.
+*/
+   if (mbuf->pkt_len > ETHER_MAX_JUMBO_FRAME_LEN)
+   return;
+
+   for (jumbo_mbuf = mbuf; jumbo_mbuf != NULL;
+   jumbo_mbuf = jumbo_mbuf->next) {
+   rte_memcpy(&jumbo_pkt_buf[pkt_index],
+   rte_pktmbuf_mtod(jumbo_mbuf, uint8_t *),
+   jumbo_mbuf->data_len);
+   pkt_index += jumbo_mbuf->data_len;
+   }
+
+   jumbo_pkt_buf[pkt_index] = '\0';
+
+   pkt = jumbo_pkt_buf;
+   }
+
+   pcap_dump(pcap_dumper, &pcap_hdr, pkt);
+
+   port->pkt_index++;
+
+   if ((port->max_pkts != 0) && (port->pkt_index >= port->max_pkts)) {
+   port->dump_finish = 1;
+   RTE_LOG(INFO, PORT, "Dumped %u packets to file\n",
+   port->pkt_index);
+   }
+
+}
+
+/**
+ * Flush pcap dumper
+ *
+ * @param dumper
+ *   Handle to pcap dumper
+ */
+
+static void
+pcap_sink_flush_pkt(void *dumper)
+{
+   pcap_dumper_t *pcap_dumper = (pcap_dumper_t *)dumper;
+
+   pcap_dump_flush(pcap_dumper);
+}
+
+/**
+ * Close a PCAP dumper handle
+ *
+ * @param dumper
+ *   Handle to pcap dumper
+ */
+static void
+pcap_sink_close(void *dumper)
+{
+   pcap_dumper_t *pcap_dumper = (pcap_dumper_t *)dumper;
+
+   pcap_dump_close(pcap_dumper);
+}
+
+#else
+
+static int
+pcap_sink_open(struct rte_port_sink *port,
+   

[dpdk-dev] [PATCH 2/4] examples/ip_pipeline: add PCAP file support

2016-01-27 Thread Fan Zhang
This patch adds PCAP file support to ip_pipeline. Input port type SOURCE
now supports loading a specific PCAP file and sends the packets in it to the
pipeline instance. The packets are then released by the SINK output port. This
feature can be applied by specifying parameters in the configuration file as
shown below:

[PIPELINE1]
type = PASS-THROUGH
core = 1
pktq_in = SOURCE0 SOURCE1
pktq_out = SINK0 SINK1
pcap_file_rd = /path/to/eth1.PCAP /path/to/eth2.PCAP
pcap_bytes_rd_per_pkt = 0 64

The configuration section "pcap_file_rd" contains the full path and name of
the PCAP file to be loaded. If multiple SOURCEs exist, each shall have
its own PCAP file path listed in this section, separated by spaces.
Multiple SOURCE ports may share the same PCAP file to be copied.

The configuration section "pcap_bytes_rd_per_pkt" contains integer value
and indicates the maximum number of bytes to be copied from each packet
in the PCAP file. If this value is "0", all packets in the file will be
copied fully; if the packet size is smaller than the assigned value, the
entire packet is copied. Same as "pcap_file_rd", every SOURCE shall have
its own maximum copy byte number.

To enable PCAP support to IP pipeline, the compiler option
CONFIG_RTE_PORT_PCAP must be set to 'y'. It is possible to disable PCAP
support by removing "pcap_file_rd" and "pcap_bytes_rd_per_pkt" lines
from the configuration file.

Signed-off-by: Fan Zhang 
Acked-by: Cristian Dumitrescu 
---
 examples/ip_pipeline/app.h  |   2 +
 examples/ip_pipeline/config_parse.c | 102 +++-
 examples/ip_pipeline/init.c |   8 +++
 3 files changed, 110 insertions(+), 2 deletions(-)

diff --git a/examples/ip_pipeline/app.h b/examples/ip_pipeline/app.h
index 6510d6d..9dbe668 100644
--- a/examples/ip_pipeline/app.h
+++ b/examples/ip_pipeline/app.h
@@ -148,6 +148,8 @@ struct app_pktq_source_params {
uint32_t parsed;
uint32_t mempool_id; /* Position in the app->mempool_params array */
uint32_t burst;
+   char *file_name; /* Full path of PCAP file to be copied to mbufs */
+   uint32_t n_bytes_per_pkt;
 };

 struct app_pktq_sink_params {
diff --git a/examples/ip_pipeline/config_parse.c 
b/examples/ip_pipeline/config_parse.c
index 1bedbe4..f0bed81 100644
--- a/examples/ip_pipeline/config_parse.c
+++ b/examples/ip_pipeline/config_parse.c
@@ -178,6 +178,8 @@ struct app_pktq_source_params default_source_params = {
.parsed = 0,
.mempool_id = 0,
.burst = 32,
+   .file_name = NULL,
+   .n_bytes_per_pkt = 0,
 };

 struct app_pktq_sink_params default_sink_params = {
@@ -924,6 +926,83 @@ parse_eal(struct app_params *app,
 }

 static int
+parse_pipeline_pcap_source(struct app_params *app,
+   struct app_pipeline_params *p,
+   const char *file_name, const char *cp_size)
+{
+   const char *next = NULL;
+   char *end;
+   uint32_t i;
+   int parse_file = 0;
+
+   if (file_name && !cp_size) {
+   next = file_name;
+   parse_file = 1; /* parse file path */
+   } else if (cp_size && !file_name) {
+   next = cp_size;
+   parse_file = 0; /* parse copy size */
+   } else
+   return -EINVAL;
+
+   char name[APP_PARAM_NAME_SIZE];
+   size_t name_len;
+
+   if (p->n_pktq_in == 0)
+   return -EINVAL;
+
+   for (i = 0; i < p->n_pktq_in; i++) {
+   if (p->pktq_in[i].type != APP_PKTQ_IN_SOURCE)
+   return -EINVAL;
+   }
+
+   i = 0;
+   while (*next != '\0') {
+   uint32_t id;
+
+   if (i >= p->n_pktq_in)
+   return -EINVAL;
+
+   id = p->pktq_in[i].id;
+
+   end = strchr(next, ' ');
+   if (!end)
+   name_len = strlen(next);
+   else
+   name_len = end - next;
+
+   if (name_len == 0 || name_len == sizeof(name))
+   return -EINVAL;
+
+   strncpy(name, next, name_len);
+   name[name_len] = '\0';
+   next += name_len;
+   if (*next != '\0')
+   next++;
+
+   if (parse_file) {
+   app->source_params[id].file_name = strdup(name);
+   if (app->source_params[id].file_name == NULL)
+   return -ENOMEM;
+   } else {
+   if (parser_read_uint32(
+   &app->source_params[id].n_bytes_per_pkt,
+   name) != 0) {
+   if (app->source_params[id].file_name != NULL)
+   free(app->source_params[id].file_name);
+   return -EINVAL;
+   }
+   }
+
+   i++;
+
+   if (i == p->n_pktq_in)
+   return 0;
+   }
+
+ 

[dpdk-dev] [PATCH 0/4] Add PCAP support to source and sink port

2016-01-27 Thread Fan Zhang
This patchset adds PCAP support to the source and sink port types in the
librte_port library, and to examples/ip_pipeline. Originally, source/sink ports
act as input and output of a NULL packet generator. This patchset enables
them to read from and write to a specific PCAP file, to generate and dump
packets.

Acked-by: Cristian Dumitrescu 

Fan Zhang (4):
  lib/librte_port: add PCAP file support to source port
  example/ip_pipeline: add PCAP file support
  lib/librte_port: add packet dumping to PCAP file support in sink port
  examples/ip_pipeline: add packets dumping to PCAP file support

 config/common_bsdapp   |   1 +
 config/common_linuxapp |   1 +
 examples/ip_pipeline/app.h |   4 +
 examples/ip_pipeline/config_parse.c| 261 ++-
 examples/ip_pipeline/init.c|  19 ++
 examples/ip_pipeline/pipeline_be.h |   2 +
 lib/librte_port/Makefile   |   4 +
 lib/librte_port/rte_port_source_sink.c | 458 +++--
 lib/librte_port/rte_port_source_sink.h |  18 +-
 mk/rte.app.mk  |   1 +
 10 files changed, 751 insertions(+), 18 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH 1/2] examples/ip_pipeline: CPU utilization measurement and

2016-01-27 Thread Fan Zhang
This patch adds CPU utilization measurement and rate computation to
packet framework. The measurement is done by measuring the cycles
spent while a thread pulls zero packets from the RX queue. These cycles are
treated as idle cycles (or headroom). The idle thread rate is updated once
per second.

Signed-off-by: Fan Zhang 
Acked-by: Cristian Dumitrescu 
---
 examples/ip_pipeline/app.h|  7 
 examples/ip_pipeline/init.c   |  5 +++
 examples/ip_pipeline/thread.c | 81 +--
 examples/ip_pipeline/thread.h | 13 +++
 4 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/examples/ip_pipeline/app.h b/examples/ip_pipeline/app.h
index 6510d6d..2b134f1 100644
--- a/examples/ip_pipeline/app.h
+++ b/examples/ip_pipeline/app.h
@@ -263,6 +263,11 @@ struct app_thread_data {

struct rte_ring *msgq_in;
struct rte_ring *msgq_out;
+
+   uint64_t time_updated;
+   uint64_t hz;
+   uint64_t headroom;
+   double headroom_rate;
 };

 struct app_eal_params {
@@ -421,6 +426,8 @@ struct app_eal_params {
 #define APP_MAX_CMDS 64
 #endif

+#define APP_THREAD_HEADROOM_STATS_COLLECT
+
 struct app_params {
/* Config */
char app_name[APP_APPNAME_SIZE];
diff --git a/examples/ip_pipeline/init.c b/examples/ip_pipeline/init.c
index 186ca03..f4c1239 100644
--- a/examples/ip_pipeline/init.c
+++ b/examples/ip_pipeline/init.c
@@ -1379,6 +1379,11 @@ app_init_threads(struct app_params *app)
t->timer_period = (rte_get_tsc_hz() * APP_THREAD_TIMER_PERIOD) 
/ 1000;
t->thread_req_deadline = time + t->timer_period;

+   t->headroom = 0;
+   t->headroom_rate = 0.0;
+   t->time_updated = time;
+   t->hz = rte_get_tsc_hz();
+
t->msgq_in = app_thread_msgq_in_get(app,
params->socket_id,
params->core_id,
diff --git a/examples/ip_pipeline/thread.c b/examples/ip_pipeline/thread.c
index 78f1bd8..0e37a26 100644
--- a/examples/ip_pipeline/thread.c
+++ b/examples/ip_pipeline/thread.c
@@ -39,6 +39,36 @@
 #include "app.h"
 #include "thread.h"

+#ifdef APP_THREAD_HEADROOM_STATS_COLLECT
+
+static void
+thread_headroom_measure_start(uint64_t *t0)
+{
+   *t0 = rte_rdtsc();
+}
+
+static void
+thread_headroom_measure_stop(int n_pkts,
+   uint64_t t0, struct app_thread_data *t)
+{
+   if (n_pkts == 0) {
+   uint64_t t1 = rte_rdtsc();
+
+   t->headroom += t1 - t0;
+   }
+}
+
+#else
+
+static void
+thread_headroom_measure_start(uint64_t *t0) {}
+
+static void
+thread_headroom_measure_stop(int n_pkts,
+   uint64_t t0, struct app_thread_data *t) {}
+
+#endif
+
 static inline void *
 thread_msg_recv(struct rte_ring *r)
 {
@@ -140,6 +170,17 @@ thread_pipeline_disable(struct app_thread_data *t,
 }

 static int
+thread_headroom(struct app_thread_data *t,
+   void *req)
+{
+   struct thread_show_headroom_msg_rsp *rsp = req;
+
+   rsp->headroom = t->headroom_rate;
+
+   return 0;
+}
+
+static int
 thread_msg_req_handle(struct app_thread_data *t)
 {
void *msg_ptr;
@@ -165,6 +206,14 @@ thread_msg_req_handle(struct app_thread_data *t)
thread_msg_send(t->msgq_out, rsp);
break;
}
+
+   case THREAD_MSG_REQ_HEADROOM: {
+   rsp->status = thread_headroom(t,
+   (struct thread_show_headroom_msg_req *) req);
+   thread_msg_send(t->msgq_out, rsp);
+   break;
+   }
+
default:
break;
}
@@ -187,15 +236,23 @@ app_thread(void *arg)
for (j = 0; j < n_regular; j++) {
struct app_thread_pipeline_data *data = >regular[j];
struct pipeline *p = data->be;
+   uint64_t t0;
+   int n_pkts;

-   rte_pipeline_run(p->p);
+   thread_headroom_measure_start();
+   n_pkts = rte_pipeline_run(p->p);
+   thread_headroom_measure_stop(n_pkts, t0, t);
}

/* Run custom pipelines */
for (j = 0; j < n_custom; j++) {
struct app_thread_pipeline_data *data = >custom[j];
+   uint64_t t0;
+   int n_pkts;

-   data->f_run(data->be);
+   thread_headroom_measure_start();
+   n_pkts = data->f_run(data->be);
+   thread_headroom_measure_stop(n_pkts, t0, t);
}

/* Timer */
@@ -252,6 +309,26 @@ app_thread(void *arg)
t_deadline = deadline;
}

+   /* Timer 

[dpdk-dev] [PATCH 0/2] Add CPU utilization to packet framework

2016-01-27 Thread Fan Zhang
This patchset adds CPU utilization rate computation and CLI command
support to packet framework. The thread idle rate is updated once per
second. User can use thread CLI command to display it.

Fan Zhang (2):
  examples/ip_pipeline: CPU utilization measurement and rate computation
  examples/ip_pipeline: add CLI command to display CPU utilization rate

 examples/ip_pipeline/app.h   |   7 +++
 examples/ip_pipeline/init.c  |   5 ++
 examples/ip_pipeline/thread.c|  81 +++-
 examples/ip_pipeline/thread.h|  13 +
 examples/ip_pipeline/thread_fe.c | 114 +++
 examples/ip_pipeline/thread_fe.h |   6 +++
 6 files changed, 224 insertions(+), 2 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v5 0/3] Handle SIGINT and SIGTERM in DPDK examples

2016-01-27 Thread Thomas Monjalon
2015-12-30 16:59, Zhihong Wang:
> Zhihong Wang (3):
>   app/test-pmd: Handle SIGINT and SIGTERM in testpmd
>   examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
>   examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

Applied, thanks


[dpdk-dev] [RFC PATCH 5/5] virtio: Extend virtio-net PMD to support container environment

2016-01-27 Thread Xie, Huawei
On 1/21/2016 7:09 PM, Tetsuya Mukawa wrote:
> +qtest_find_pci_device(struct qtest_session *s, uint16_t bus, uint8_t device)
> +{
> + struct qtest_pci_device *dev;
> + uint32_t val;
> +
> + val = qtest_pci_inl(s, bus, device, 0, 0);
> + TAILQ_FOREACH(dev, >head, next) {
> + if (val == ((uint32_t)dev->device_id << 16 | dev->vendor_id)) {
> + dev->bus_addr = bus;
> + dev->device_addr = device;
> + return;
> + }
> +
> + }
> +}
> +
> +static int
> +qtest_init_pci_devices(struct qtest_session *s)
> +{
> + struct qtest_pci_device *dev;
> + uint16_t bus;
> + uint8_t device;
> + int ret;
> +
> + /* Find devices */
> + bus = 0;
> + do {
> + device = 0;
> + do {
> + qtest_find_pci_device(s, bus, device);
> + } while (device++ != NB_DEVICE - 1);
> + } while (bus++ != NB_BUS - 1);

It seems this scan of all the PCI devices is a very time-consuming operation,
and each scan involves socket communication.
Did you measure how long the PCI device initialization takes?

> +
> + /* Initialize devices */
> + TAILQ_FOREACH(dev, >head, next) {
> + ret = dev->init(s, dev);
> + if (ret != 0)
> + return ret;
> + }
> +
> + return 0;



[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMUi driver mode

2016-01-27 Thread Thomas Monjalon
2016-01-27 21:02, Santosh Shukla:
> 1. virtio currently works for vfio+noiommu and likely will work for
> vfio+iommu in near future.
> 2. So remove __noiommu suffix and always use default.
> 3. Introduce vfio resource parsing global function, That function
> suppose to do parsing for default vfio case and for vfio-noiommu case.
> This function will be used by pmd drivers for resource parsing purpose
> example virtio.
> 
> Yuan won't be happy with 3) I guess, because he wanted to get rid of
> interface parsing from pmd driver.
> 
> Thomas, if 1/2/3/ addresses your concern then I'll spin the series,

I agree with 1/ and 2/.
Please, could you explain why 3/ is needed?


[dpdk-dev] [PATCH] ethdev: fix byte order inconsistence between fdir flow and mask

2016-01-27 Thread Jingjing Wu
Fixed a byte order issue in the ethdev library: the structures for
setting fdir's mask and flow entry were inconsistent. The mask
inputs are now in big endian.

fixes: 76c6f89e80d4 ("ixgbe: support new flow director masks")
   2d4c1a9ea2ac ("ethdev: add new flow director masks")

Reported-by: Yaacov Hazan 
Signed-off-by: Jingjing Wu 
---
 app/test-pmd/cmdline.c   |  6 ++---
 doc/guides/rel_notes/release_2_3.rst |  6 +
 drivers/net/ixgbe/ixgbe_fdir.c   | 47 ++--
 lib/librte_ether/rte_eth_ctrl.h  |  7 --
 4 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 73298c9..13194c9 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -8687,13 +8687,13 @@ cmd_flow_director_mask_parsed(void *parsed_result,
return;
}

-   mask->vlan_tci_mask = res->vlan_mask;
+   mask->vlan_tci_mask = rte_cpu_to_be_16(res->vlan_mask);
IPV4_ADDR_TO_UINT(res->ipv4_src, mask->ipv4_mask.src_ip);
IPV4_ADDR_TO_UINT(res->ipv4_dst, mask->ipv4_mask.dst_ip);
IPV6_ADDR_TO_ARRAY(res->ipv6_src, mask->ipv6_mask.src_ip);
IPV6_ADDR_TO_ARRAY(res->ipv6_dst, mask->ipv6_mask.dst_ip);
-   mask->src_port_mask = res->port_src;
-   mask->dst_port_mask = res->port_dst;
+   mask->src_port_mask = rte_cpu_to_be_16(res->port_src);
+   mask->dst_port_mask = rte_cpu_to_be_16(res->port_dst);
}

cmd_reconfig_device_queue(res->port_id, 1, 1);
diff --git a/doc/guides/rel_notes/release_2_3.rst 
b/doc/guides/rel_notes/release_2_3.rst
index 99de186..28d0f27 100644
--- a/doc/guides/rel_notes/release_2_3.rst
+++ b/doc/guides/rel_notes/release_2_3.rst
@@ -19,6 +19,10 @@ Drivers
 Libraries
 ~

+* ** fix byte order inconsistence between fdir flow and mask **
+
+  Fixed issue in ethdev library that the structure for setting
+  fdir's mask and flow entry is inconsist in byte order.

 Examples
 
@@ -39,6 +43,8 @@ API Changes
 ABI Changes
 ---

+* The fields in  The ethdev structures ``rte_eth_fdir_masks`` were
+  changed to be in big endian.

 Shared Library Versions
 ---
diff --git a/drivers/net/ixgbe/ixgbe_fdir.c b/drivers/net/ixgbe/ixgbe_fdir.c
index e03219b..7423b2d 100644
--- a/drivers/net/ixgbe/ixgbe_fdir.c
+++ b/drivers/net/ixgbe/ixgbe_fdir.c
@@ -309,6 +309,7 @@ fdir_set_input_mask_82599(struct rte_eth_dev *dev,
uint32_t fdiripv6m; /* IPv6 source and destination masks. */
uint16_t dst_ipv6m = 0;
uint16_t src_ipv6m = 0;
+   volatile uint32_t *reg;

PMD_INIT_FUNC_TRACE();

@@ -322,16 +323,16 @@ fdir_set_input_mask_82599(struct rte_eth_dev *dev,
/* use the L4 protocol mask for raw IPv4/IPv6 traffic */
fdirm |= IXGBE_FDIRM_L4P;

-   if (input_mask->vlan_tci_mask == 0x0FFF)
+   if (input_mask->vlan_tci_mask == rte_cpu_to_be_16(0x0FFF))
/* mask VLAN Priority */
fdirm |= IXGBE_FDIRM_VLANP;
-   else if (input_mask->vlan_tci_mask == 0xE000)
+   else if (input_mask->vlan_tci_mask == rte_cpu_to_be_16(0xE000))
/* mask VLAN ID */
fdirm |= IXGBE_FDIRM_VLANID;
else if (input_mask->vlan_tci_mask == 0)
/* mask VLAN ID and Priority */
fdirm |= IXGBE_FDIRM_VLANID | IXGBE_FDIRM_VLANP;
-   else if (input_mask->vlan_tci_mask != 0xEFFF) {
+   else if (input_mask->vlan_tci_mask != rte_cpu_to_be_16(0xEFFF)) {
PMD_INIT_LOG(ERR, "invalid vlan_tci_mask");
return -EINVAL;
}
@@ -340,19 +341,26 @@ fdir_set_input_mask_82599(struct rte_eth_dev *dev,
IXGBE_WRITE_REG(hw, IXGBE_FDIRM, fdirm);

/* store the TCP/UDP port masks, bit reversed from port layout */
-   fdirtcpm = reverse_fdir_bitmasks(input_mask->dst_port_mask,
-input_mask->src_port_mask);
+   fdirtcpm = reverse_fdir_bitmasks(
+   rte_be_to_cpu_16(input_mask->dst_port_mask),
+   rte_be_to_cpu_16(input_mask->src_port_mask));

-   /* write all the same so that UDP, TCP and SCTP use the same mask */
+   /* write all the same so that UDP, TCP and SCTP use the same mask
+* (little-endian)
+   */
IXGBE_WRITE_REG(hw, IXGBE_FDIRTCPM, ~fdirtcpm);
IXGBE_WRITE_REG(hw, IXGBE_FDIRUDPM, ~fdirtcpm);
IXGBE_WRITE_REG(hw, IXGBE_FDIRSCTPM, ~fdirtcpm);
info->mask.src_port_mask = input_mask->src_port_mask;
info->mask.dst_port_mask = input_mask->dst_port_mask;

-   /* Store source and destination IPv4 masks (big-endian) */
-   IXGBE_WRITE_REG(hw, IXGBE_FDIRSIP4M, ~(input_mask->ipv4_mask.src_ip));
-   IXGBE_WRITE_REG(hw, IXGBE_FDIRDIP4M, ~(input_mask->ipv4_mask.dst_ip));
+   /* Store source 

[dpdk-dev] [PATCH 2/2] kdp: add virtual PMD for kernel slow data path communication

2016-01-27 Thread Ferruh Yigit
This patch provides slow data path communication to the Linux kernel.
Patch is based on librte_kni, and heavily re-uses it.

The main difference is librte_kni library converted into a PMD, to
provide ease of use for applications.

Now any application can use slow path communication without any update
in application, because of existing eal support for virtual PMD.

This PMD also supports two methods to send packets to Linux. The first
is a custom FIFO implementation with the help of the KDP kernel module; the
second is the Linux in-kernel tun/tap support. The PMD first checks for the KDP
kernel module; if that fails, it tries to create and use a tap interface.

With FIFO method: PMD's rx_pkt_burst() get packets from FIFO,
and tx_pkt_burst() puts packet to the FIFO.
The corresponding Linux virtual network device driver code
also gets/puts packets from FIFO as they are coming from hardware.

With tun/tap method: no external kernel module required, PMD reads from
and writes packets to the tap interface file descriptor. Tap interface
has performance penalty against FIFO implementation.

Signed-off-by: Ferruh Yigit 
---
 config/common_linuxapp  |   1 +
 doc/guides/nics/pcap_ring.rst   | 125 -
 doc/guides/rel_notes/release_2_3.rst|   6 +
 drivers/net/Makefile|   3 +-
 drivers/net/kdp/Makefile|  61 
 drivers/net/kdp/rte_eth_kdp.c   | 481 
 drivers/net/kdp/rte_kdp.c   | 365 
 drivers/net/kdp/rte_kdp.h   | 126 +
 drivers/net/kdp/rte_kdp_fifo.h  |  91 ++
 drivers/net/kdp/rte_kdp_tap.c   |  96 +++
 drivers/net/kdp/rte_pmd_kdp_version.map |   4 +
 lib/librte_eal/common/include/rte_log.h |   3 +-
 mk/rte.app.mk   |   3 +-
 13 files changed, 1359 insertions(+), 6 deletions(-)
 create mode 100644 drivers/net/kdp/Makefile
 create mode 100644 drivers/net/kdp/rte_eth_kdp.c
 create mode 100644 drivers/net/kdp/rte_kdp.c
 create mode 100644 drivers/net/kdp/rte_kdp.h
 create mode 100644 drivers/net/kdp/rte_kdp_fifo.h
 create mode 100644 drivers/net/kdp/rte_kdp_tap.c
 create mode 100644 drivers/net/kdp/rte_pmd_kdp_version.map

diff --git a/config/common_linuxapp b/config/common_linuxapp
index 73c91d8..b9dec0c 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -322,6 +322,7 @@ CONFIG_RTE_LIBRTE_PMD_NULL=y
 #
 # Compile KDP PMD
 #
+CONFIG_RTE_LIBRTE_PMD_KDP=y
 CONFIG_RTE_KDP_KMOD=y
 CONFIG_RTE_KDP_PREEMPT_DEFAULT=y

diff --git a/doc/guides/nics/pcap_ring.rst b/doc/guides/nics/pcap_ring.rst
index 46aa3ac..78b7b61 100644
--- a/doc/guides/nics/pcap_ring.rst
+++ b/doc/guides/nics/pcap_ring.rst
@@ -28,11 +28,11 @@
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-Libpcap and Ring Based Poll Mode Drivers
-
+Software Poll Mode Drivers
+==

 In addition to Poll Mode Drivers (PMDs) for physical and virtual hardware,
-the DPDK also includes two pure-software PMDs. These two drivers are:
+the DPDK also includes pure-software PMDs. These drivers are:

 *   A libpcap -based PMD (librte_pmd_pcap) that reads and writes packets using 
libpcap,
 - both from files on disk, as well as from physical NIC devices using 
standard Linux kernel drivers.
@@ -40,6 +40,10 @@ the DPDK also includes two pure-software PMDs. These two 
drivers are:
 *   A ring-based PMD (librte_pmd_ring) that allows a set of software FIFOs 
(that is, rte_ring)
 to be accessed using the PMD APIs, as though they were physical NICs.

+*   A slow data path PMD (librte_pmd_kdp) that allows send/get packets to/from 
OS network
+stack as it is a physical NIC.
+
+
 .. note::

 The libpcap -based PMD is disabled by default in the build configuration 
files,
@@ -211,6 +215,121 @@ Multiple devices may be specified, separated by commas.
 Done.


+Kernel Data Path PMD
+
+
+Kernel Data Path (KDP) PMD is to communicate with OS network stack easily by 
application.
+
+.. code-block:: console
+
+./testpmd --vdev eth_kdp0 --vdev eth_kdp1 -- -i
+...
+Configuring Port 0 (socket 0)
+Port 0: 00:00:00:00:00:00
+Configuring Port 1 (socket 0)
+Port 1: 00:00:00:00:00:00
+Checking link statuses...
+Port 0 Link Up - speed 1 Mbps - full-duplex
+Port 1 Link Up - speed 1 Mbps - full-duplex
+Done
+
+KDP PMD supports two type of communication:
+
+* Custom FIFO implementation
+* tun/tap implementation
+
+Custom FIFO implementation gives more performance but requires KDP kernel 
module (rte_kdp.ko) inserted.
+
+By default FIFO communication has priority, if KDP kernel module is not 
inserted, tun/tap communication used.
+
+If KDP kernel module inserted, above testpmd command will create following 
virtual 

[dpdk-dev] [PATCH 1/2] kdp: add kernel data path kernel module

2016-01-27 Thread Ferruh Yigit
This kernel module is based on the KNI module, but it is a stripped-down
version that handles only data messages; no control functionality is
provided.

FIFO implementation of the KNI is kept exact same, but ethtool related
code removed and virtual network management related code simplified.

This module contains kernel support to create network devices and
this module has a simple driver for virtual network device, the driver
simply puts/gets packets to/from FIFO instead of real hardware.

The FIFO is created and owned by the userspace application, which in this
case is the KDP PMD.

In the long term this patch intends to replace the KNI, and KNI will be
deprecated.

Signed-off-by: Ferruh Yigit 
---
 config/common_linuxapp |   8 +-
 lib/librte_eal/linuxapp/Makefile   |   5 +-
 lib/librte_eal/linuxapp/eal/Makefile   |   3 +-
 .../linuxapp/eal/include/exec-env/rte_kdp_common.h | 143 +
 lib/librte_eal/linuxapp/kdp/Makefile   |  56 ++
 lib/librte_eal/linuxapp/kdp/kdp_dev.h  |  82 +++
 lib/librte_eal/linuxapp/kdp/kdp_fifo.h |  91 
 lib/librte_eal/linuxapp/kdp/kdp_misc.c | 463 +
 lib/librte_eal/linuxapp/kdp/kdp_net.c  | 573 +
 9 files changed, 1421 insertions(+), 3 deletions(-)
 create mode 100644 
lib/librte_eal/linuxapp/eal/include/exec-env/rte_kdp_common.h
 create mode 100644 lib/librte_eal/linuxapp/kdp/Makefile
 create mode 100644 lib/librte_eal/linuxapp/kdp/kdp_dev.h
 create mode 100644 lib/librte_eal/linuxapp/kdp/kdp_fifo.h
 create mode 100644 lib/librte_eal/linuxapp/kdp/kdp_misc.c
 create mode 100644 lib/librte_eal/linuxapp/kdp/kdp_net.c

diff --git a/config/common_linuxapp b/config/common_linuxapp
index 74bc515..73c91d8 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -1,6 +1,6 @@
 #   BSD LICENSE
 #
-#   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+#   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 #   All rights reserved.
 #
 #   Redistribution and use in source and binary forms, with or without
@@ -320,6 +320,12 @@ CONFIG_RTE_LIBRTE_PMD_XENVIRT=n
 CONFIG_RTE_LIBRTE_PMD_NULL=y

 #
+# Compile KDP PMD
+#
+CONFIG_RTE_KDP_KMOD=y
+CONFIG_RTE_KDP_PREEMPT_DEFAULT=y
+
+#
 # Do prefetch of packet data within PMD driver receive function
 #
 CONFIG_RTE_PMD_PACKET_PREFETCH=y
diff --git a/lib/librte_eal/linuxapp/Makefile b/lib/librte_eal/linuxapp/Makefile
index d9c5233..e3f91a7 100644
--- a/lib/librte_eal/linuxapp/Makefile
+++ b/lib/librte_eal/linuxapp/Makefile
@@ -1,6 +1,6 @@
 #   BSD LICENSE
 #
-#   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+#   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 #   All rights reserved.
 #
 #   Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,9 @@ DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal
 ifeq ($(CONFIG_RTE_KNI_KMOD),y)
 DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += kni
 endif
+ifeq ($(CONFIG_RTE_KDP_KMOD),y)
+DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += kdp
+endif
 ifeq ($(CONFIG_RTE_LIBRTE_XEN_DOM0),y)
 DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += xen_dom0
 endif
diff --git a/lib/librte_eal/linuxapp/eal/Makefile 
b/lib/librte_eal/linuxapp/eal/Makefile
index 26eced5..ac72aea 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -1,6 +1,6 @@
 #   BSD LICENSE
 #
-#   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+#   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 #   All rights reserved.
 #
 #   Redistribution and use in source and binary forms, with or without
@@ -116,6 +116,7 @@ CFLAGS_eal_thread.o += -Wno-return-type
 endif

 INC := rte_interrupts.h rte_kni_common.h rte_dom0_common.h
+INC += rte_kdp_common.h

 SYMLINK-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP)-include/exec-env := \
$(addprefix include/exec-env/,$(INC))
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kdp_common.h 
b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kdp_common.h
new file mode 100644
index 000..0c77f58
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kdp_common.h
@@ -0,0 +1,143 @@
+/*-
+ *   This file is provided under a dual BSD/LGPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GNU LESSER GENERAL PUBLIC LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2.1 of the GNU Lesser General Public License
+ *   as published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   Lesser General Public License for more details.
+ *
+ *   You should have 

[dpdk-dev] [PATCH 0/2] slow data path communication between DPDK port and Linux

2016-01-27 Thread Ferruh Yigit
This is slow data path communication implementation based on existing KNI.

Difference is: librte_kni converted into a PMD, kdp kernel module is almost
same except all control path functionality removed and some simplification done.

Motivation is to simplify slow path data communication.
Now any application can use this new PMD to send/get data to Linux kernel.

PMD supports two communication methods:

1) KDP kernel module
PMD initialization functions handles creating virtual interfaces (with help of
kdp kernel module) and created FIFO. FIFO is used to share data between
userspace and kernelspace. This is default method.

2) tun/tap module
When KDP module is not inserted, PMD creates tap interface and transfers
packets using tap interface.

In the long term this patch intends to replace the KNI, and KNI will be
deprecated.

Sample usage:
1) Transfer any packet received from NIC that bound to DPDK, to the Linux kernel

a) insert kdp kernel module
insmod build/kmod/rte_kdp.ko

b) bind NIC to the DPDK using dpdk_nic_bind.py

c) ./testpmd --vdev eth_kdp0

c1) testpmd show two ports, one of them physical, other virtual
...
Configuring Port 0 (socket 0)
Port 0: 00:00:00:00:00:00
Configuring Port 1 (socket 0)
...
Checking link statuses...
Port 0 Link Up - speed 1 Mbps - full-duplex
Port 1 Link Up - speed 1 Mbps - full-duplex
Done

c2) This will create "kdp0" Linux interface
$ ip l show kdp0
21: kdp0:  mtu 1500 qdisc noop state DOWN mode DEFAULT 
group default qlen 1000
link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff

d) Linux port can be used for data

d1)
$ ifconfig kdp0 1.0.0.2
$ ping 1.0.0.1
PING 1.0.0.1 (1.0.0.1) 56(84) bytes of data.
64 bytes from 1.0.0.1: icmp_seq=1 ttl=64 time=0.789 ms
64 bytes from 1.0.0.1: icmp_seq=2 ttl=64 time=0.881 ms

d2)
$ tcpdump -nn -i kdp0
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on kdp0, link-type EN10MB (Ethernet), capture size 262144 bytes
15:01:22.407506 IP 1.0.0.1 > 1.0.0.2: ICMP echo request, id 40016, seq 18, 
length 64
15:01:22.408521 IP 1.0.0.2 > 1.0.0.1: ICMP echo reply, id 40016, seq 18, length 
64



2) Data travels between virtual Linux interfaces pass from DPDK application,
application can alter data

a) insert kdp kernel module
insmod build/kmod/rte_kdp.ko

b) No physical NIC involved

c) ./testpmd --vdev eth_kdp0 --vdev eth_kdp1

c1) testpmd show two ports, both of them are virtual
...
Configuring Port 0 (socket 0)
Port 0: 00:00:00:00:00:00
Configuring Port 1 (socket 0)
Port 1: 00:00:00:00:00:00
Checking link statuses...
Port 0 Link Up - speed 1 Mbps - full-duplex
Port 1 Link Up - speed 1 Mbps - full-duplex
Done

c2) This will create "kdp0"  and "kdp1" Linux interfaces
$ ip l show kdp0; ip l show kdp1
22: kdp0:  mtu 1500 qdisc noop state DOWN mode DEFAULT 
group default qlen 1000
link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff
23: kdp1:  mtu 1500 qdisc noop state DOWN mode DEFAULT 
group default qlen 1000
link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff

d) Data travel between virtual ports pass from DPDK application
$ifconfig kdp0 1.0.0.1
$ifconfig kdp1 1.0.0.2

d1)
$ ping 1.0.0.1
PING 1.0.0.1 (1.0.0.1) 56(84) bytes of data.
64 bytes from 1.0.0.1: icmp_seq=1 ttl=64 time=3.57 ms
64 bytes from 1.0.0.1: icmp_seq=2 ttl=64 time=1.85 ms
64 bytes from 1.0.0.1: icmp_seq=3 ttl=64 time=1.89 ms

d2)
$ tcpdump -nn -i kdp0
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on kdp0, link-type EN10MB (Ethernet), capture size 262144 bytes
15:20:51.908543 IP 1.0.0.2 > 1.0.0.1: ICMP echo request, id 41234, seq 1, 
length 64
15:20:51.909570 IP 1.0.0.1 > 1.0.0.2: ICMP echo reply, id 41234, seq 1, length 
64
15:20:52.909551 IP 1.0.0.2 > 1.0.0.1: ICMP echo request, id 41234, seq 2, 
length 64
15:20:52.910577 IP 1.0.0.1 > 1.0.0.2: ICMP echo reply, id 41234, seq 2, length 
64



3) tun/tap interface usage

a) No external module required, tun/tap support in kernel required

b) ./testpmd --vdev eth_kdp0 --vdev eth_kdp1

b1) This will create "tap_kdp0"  and "tap_kdp1" Linux interfaces
$ ip l show tap_kdp0; ip l show tap_kdp1
25: tap_kdp0:  mtu 1500 qdisc noop state DOWN mode DEFAULT 
group default qlen 500
link/ether 56:47:97:9c:03:8e brd ff:ff:ff:ff:ff:ff
26: tap_kdp1:  mtu 1500 qdisc noop state DOWN mode DEFAULT 
group default qlen 500
link/ether 5e:15:22:b0:52:42 brd ff:ff:ff:ff:ff:ff

Ferruh Yigit (2):
  kdp: add kernel data path kernel module
  kdp: add virtual PMD for kernel slow data path communication

 config/common_linuxapp |   9 +-
 doc/guides/nics/pcap_ring.rst  | 125 -
 doc/guides/rel_notes/release_2_3.rst   |   6 +
 drivers/net/Makefile   |   3 +-
 drivers/net/kdp/Makefile   |  61 +++
 drivers/net/kdp/rte_eth_kdp.c  | 481 

[dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms

2016-01-27 Thread Thomas Monjalon
> Zhihong Wang (5):
>   lib/librte_eal: Identify AVX512 CPU flag
>   mk: Predefine AVX512 macro for compiler
>   lib/librte_eal: Optimize memcpy for AVX512 platforms
>   app/test: Adjust alignment unit for memcpy perf test
>   lib/librte_eal: Tune memcpy for prior platforms
> 
>  app/test/test_memcpy_perf.c|   6 +
>  .../common/include/arch/x86/rte_cpuflags.h |   2 +
>  .../common/include/arch/x86/rte_memcpy.h   | 269 
> -
>  mk/rte.cpuflags.mk |   4 +
>  4 files changed, 268 insertions(+), 13 deletions(-)

The maintainers of arch/x86 are Bruce and Konstantin.
I guess there is no comment and we can apply this cool series?


[dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode

2016-01-27 Thread Burakov, Anatoly
Hi Thomas,

> > Is it possible (is it better) to declare these functions with 
> > vfio_dma_func_t?
> 
> Yeah, sure. Or maybe the other way around - maybe we could do away with
> the typedef. I'll go for the former though.

No, we can't declare the functions with a function pointer. At least I don't 
see any obvious way to do that without incurring multiple declarations compile 
error. So I'll leave it as forward declarations. Of course, the other 
alternative is to put the array below the functions and make them static, to 
avoid forward declarations, but I think it's much clearer the way it is now.

Thanks,
Anatoly


[dpdk-dev] [PATCH 3/3] examples/ethtool: add control interface support to the application

2016-01-27 Thread Ferruh Yigit
Control interface APIs added into the sample application.

To enable this support, the corresponding kernel module (KCP) needs to be
inserted. If the kernel module is not present, the application runs as before,
without kernel control path support.

When KCP module inserted, running application creates a virtual Linux
network interface (dpdk$) per DPDK port. This interface can be used by
traditional Linux tools.

Signed-off-by: Ferruh Yigit 
---
 doc/guides/sample_app_ug/ethtool.rst | 41 
 examples/ethtool/ethtool-app/main.c  | 10 +++--
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/doc/guides/sample_app_ug/ethtool.rst 
b/doc/guides/sample_app_ug/ethtool.rst
index 4d1697e..2174288 100644
--- a/doc/guides/sample_app_ug/ethtool.rst
+++ b/doc/guides/sample_app_ug/ethtool.rst
@@ -131,6 +131,47 @@ application`_. Individual call-back functions handle the 
detail
 associated with each command, which make use of the functions
 defined in the `Ethtool interface`_ to the DPDK functions.

+Control Interface
+~
+
+If Kernel Control Path (KCP) kernel module (rte_kcp.ko) inserted,
+virtual interfaces created for each DPDK port for control purposes.
+
+Created interfaces are named as dpdk#, like:
+
+.. code-block:: console
+
+# ifconfig dpdk0; ifconfig dpdk1
+dpdk0: flags=4099  mtu 1500
+ether 90:e2:ba:0e:49:b9  txqueuelen 1000  (Ethernet)
+RX packets 0  bytes 0 (0.0 B)
+RX errors 0  dropped 0  overruns 0  frame 0
+TX packets 0  bytes 0 (0.0 B)
+TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0
+
+dpdk1: flags=4099  mtu 1500
+ether 00:1b:21:76:fa:21  txqueuelen 1000  (Ethernet)
+RX packets 0  bytes 0 (0.0 B)
+RX errors 0  dropped 0  overruns 0  frame 0
+TX packets 0  bytes 0 (0.0 B)
+TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0
+
+Regular Linux commands can be issued on interfaces:
+
+.. code-block:: console
+
+# ethtool -i dpdk0
+driver: rte_ixgbe_pmd
+version: RTE 2.3.0-rc0
+firmware-version:
+expansion-rom-version:
+bus-info: :08:00.1
+supports-statistics: yes
+supports-test: no
+supports-eeprom-access: yes
+supports-register-dump: yes
+supports-priv-flags: no
+
 Ethtool interface
 -

diff --git a/examples/ethtool/ethtool-app/main.c 
b/examples/ethtool/ethtool-app/main.c
index e21abcd..68b13ad 100644
--- a/examples/ethtool/ethtool-app/main.c
+++ b/examples/ethtool/ethtool-app/main.c
@@ -1,7 +1,7 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
  *   All rights reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "ethapp.h"

@@ -54,7 +55,6 @@
 #define PKTPOOL_EXTRA_SIZE 512
 #define PKTPOOL_CACHE 32

-
 struct txq_port {
uint16_t cnt_unsent;
struct rte_mbuf *buf_frames[MAX_BURST_LENGTH];
@@ -254,6 +254,8 @@ static int slave_main(__attribute__((unused)) void 
*ptr_data)
}
rte_spinlock_unlock(_port->lock);
} /* end for( idx_port ) */
+   rte_eth_control_interface_process_msg(
+   RTE_ETHTOOL_CTRL_IF_PROCESS_MSG, 0);
} /* end for(;;) */

return 0;
@@ -293,6 +295,8 @@ int main(int argc, char **argv)
id_core = rte_get_next_lcore(id_core, 1, 1);
rte_eal_remote_launch(slave_main, NULL, id_core);

+   rte_eth_control_interface_create();
+
ethapp_main();

app_cfg.exit_now = 1;
@@ -301,5 +305,7 @@ int main(int argc, char **argv)
return -1;
}

+   rte_eth_control_interface_destroy();
+
return 0;
 }
-- 
2.5.0



[dpdk-dev] [PATCH 2/3] rte_ctrl_if: add control interface library

2016-01-27 Thread Ferruh Yigit
This library gets control messages from kernelspace, forwards them to
librte_ether, and returns the response back to kernelspace.

Library does:
1) Trigger Linux virtual interface creation
2) Initialize the netlink socket communication
3) Provide a process() API to the application for processing the
received messages

This library requires corresponding kernel module to be inserted.

Signed-off-by: Ferruh Yigit 
---
 config/common_linuxapp |   3 +-
 doc/api/doxy-api-index.md  |   3 +-
 doc/api/doxy-api.conf  |   1 +
 doc/guides/rel_notes/release_2_3.rst   |   9 +
 lib/Makefile   |   3 +-
 lib/librte_ctrl_if/Makefile|  58 +
 lib/librte_ctrl_if/rte_ctrl_if.c   | 162 +
 lib/librte_ctrl_if/rte_ctrl_if.h   | 115 ++
 lib/librte_ctrl_if/rte_ctrl_if_version.map |   9 +
 lib/librte_ctrl_if/rte_ethtool.c   | 354 +
 lib/librte_ctrl_if/rte_ethtool.h   |  54 +
 lib/librte_ctrl_if/rte_nl.c| 259 +
 lib/librte_ctrl_if/rte_nl.h|  60 +
 lib/librte_eal/common/include/rte_log.h|   3 +-
 mk/rte.app.mk  |   3 +-
 15 files changed, 1091 insertions(+), 5 deletions(-)
 create mode 100644 lib/librte_ctrl_if/Makefile
 create mode 100644 lib/librte_ctrl_if/rte_ctrl_if.c
 create mode 100644 lib/librte_ctrl_if/rte_ctrl_if.h
 create mode 100644 lib/librte_ctrl_if/rte_ctrl_if_version.map
 create mode 100644 lib/librte_ctrl_if/rte_ethtool.c
 create mode 100644 lib/librte_ctrl_if/rte_ethtool.h
 create mode 100644 lib/librte_ctrl_if/rte_nl.c
 create mode 100644 lib/librte_ctrl_if/rte_nl.h

diff --git a/config/common_linuxapp b/config/common_linuxapp
index 5d5e3e4..f72ba0e 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -1,6 +1,6 @@
 #   BSD LICENSE
 #
-#   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+#   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 #   All rights reserved.
 #
 #   Redistribution and use in source and binary forms, with or without
@@ -507,6 +507,7 @@ CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
 #
 CONFIG_RTE_KCP_KMOD=y
 CONFIG_RTE_KCP_KO_DEBUG=n
+CONFIG_RTE_LIBRTE_CTRL_IF=y

 #
 # Compile vhost library
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 7a91001..214d16e 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -149,4 +149,5 @@ There are many libraries, so their headers may be grouped 
by topics:
   [common] (@ref rte_common.h),
   [ABI compat] (@ref rte_compat.h),
   [keepalive]  (@ref rte_keepalive.h),
-  [version](@ref rte_version.h)
+  [version](@ref rte_version.h),
+  [control interface]  (@ref rte_ctrl_if.h)
diff --git a/doc/api/doxy-api.conf b/doc/api/doxy-api.conf
index 57e8b5d..fd69bf1 100644
--- a/doc/api/doxy-api.conf
+++ b/doc/api/doxy-api.conf
@@ -39,6 +39,7 @@ INPUT   = doc/api/doxy-api-index.md \
   lib/librte_cmdline \
   lib/librte_compat \
   lib/librte_cryptodev \
+  lib/librte_ctrl_if \
   lib/librte_distributor \
   lib/librte_ether \
   lib/librte_hash \
diff --git a/doc/guides/rel_notes/release_2_3.rst 
b/doc/guides/rel_notes/release_2_3.rst
index 99de186..39725e4 100644
--- a/doc/guides/rel_notes/release_2_3.rst
+++ b/doc/guides/rel_notes/release_2_3.rst
@@ -4,6 +4,14 @@ DPDK Release 2.3
 New Features
 

+* **Control interface support added.**
+
+  To enable controlling DPDK ports by common Linux tools.
+  Following modules added to DPDK:
+
+  * librte_ctrl_if library
+  * librte_eal/linuxapp/kcp kernel module
+

 Resolved Issues
 ---
@@ -51,6 +59,7 @@ The libraries prepended with a plus sign were incremented in 
this version.
  librte_acl.so.2
  librte_cfgfile.so.2
  librte_cmdline.so.1
+   + librte_ctrl_if.so.1
  librte_distributor.so.1
  librte_eal.so.2
  librte_hash.so.2
diff --git a/lib/Makefile b/lib/Makefile
index ef172ea..a50bc1e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -1,6 +1,6 @@
 #   BSD LICENSE
 #
-#   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+#   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 #   All rights reserved.
 #
 #   Redistribution and use in source and binary forms, with or without
@@ -58,6 +58,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_PORT) += librte_port
 DIRS-$(CONFIG_RTE_LIBRTE_TABLE) += librte_table
 DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += librte_pipeline
 DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder
+DIRS-$(CONFIG_RTE_LIBRTE_CTRL_IF) += librte_ctrl_if

 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git 

[dpdk-dev] [PATCH 1/3] kcp: add kernel control path kernel module

2016-01-27 Thread Ferruh Yigit
This kernel module is based on the KNI module, but it is a stripped-down
version of it that handles only control messages; no data transfer
functionality is provided.

This Linux kernel module helps userspace application create virtual
interfaces and when a control command issued into that virtual
interface, module pushes the command to the userspace and gets the
response back for the caller application.

Linux tools like ethtool/ifconfig/ip can be used on these virtual
interfaces, but data-related tools such as tcpdump cannot.

In the long term this patch intends to replace KNI, and KNI will be
deprecated.

Signed-off-by: Ferruh Yigit 
---
 config/common_linuxapp |   6 +
 lib/librte_eal/linuxapp/Makefile   |   5 +-
 lib/librte_eal/linuxapp/eal/Makefile   |   3 +-
 .../linuxapp/eal/include/exec-env/rte_kcp_common.h |  86 +++
 lib/librte_eal/linuxapp/kcp/Makefile   |  58 +
 lib/librte_eal/linuxapp/kcp/kcp_dev.h  |  65 +
 lib/librte_eal/linuxapp/kcp/kcp_ethtool.c  | 261 +++
 lib/librte_eal/linuxapp/kcp/kcp_misc.c | 282 +
 lib/librte_eal/linuxapp/kcp/kcp_net.c  | 209 +++
 lib/librte_eal/linuxapp/kcp/kcp_nl.c   | 194 ++
 10 files changed, 1167 insertions(+), 2 deletions(-)
 create mode 100644 
lib/librte_eal/linuxapp/eal/include/exec-env/rte_kcp_common.h
 create mode 100644 lib/librte_eal/linuxapp/kcp/Makefile
 create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_dev.h
 create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_ethtool.c
 create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_misc.c
 create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_net.c
 create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_nl.c

diff --git a/config/common_linuxapp b/config/common_linuxapp
index 74bc515..5d5e3e4 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -503,6 +503,12 @@ CONFIG_RTE_KNI_VHOST_DEBUG_RX=n
 CONFIG_RTE_KNI_VHOST_DEBUG_TX=n

 #
+# Compile librte_ctrl_if
+#
+CONFIG_RTE_KCP_KMOD=y
+CONFIG_RTE_KCP_KO_DEBUG=n
+
+#
 # Compile vhost library
 # fuse-devel is needed to run vhost-cuse.
 # fuse-devel enables user space char driver development
diff --git a/lib/librte_eal/linuxapp/Makefile b/lib/librte_eal/linuxapp/Makefile
index d9c5233..d1fa3a3 100644
--- a/lib/librte_eal/linuxapp/Makefile
+++ b/lib/librte_eal/linuxapp/Makefile
@@ -1,6 +1,6 @@
 #   BSD LICENSE
 #
-#   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+#   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 #   All rights reserved.
 #
 #   Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,9 @@ DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal
 ifeq ($(CONFIG_RTE_KNI_KMOD),y)
 DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += kni
 endif
+ifeq ($(CONFIG_RTE_KCP_KMOD),y)
+DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += kcp
+endif
 ifeq ($(CONFIG_RTE_LIBRTE_XEN_DOM0),y)
 DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += xen_dom0
 endif
diff --git a/lib/librte_eal/linuxapp/eal/Makefile 
b/lib/librte_eal/linuxapp/eal/Makefile
index 26eced5..dded8cb 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -1,6 +1,6 @@
 #   BSD LICENSE
 #
-#   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+#   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 #   All rights reserved.
 #
 #   Redistribution and use in source and binary forms, with or without
@@ -116,6 +116,7 @@ CFLAGS_eal_thread.o += -Wno-return-type
 endif

 INC := rte_interrupts.h rte_kni_common.h rte_dom0_common.h
+INC += rte_kcp_common.h

 SYMLINK-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP)-include/exec-env := \
$(addprefix include/exec-env/,$(INC))
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kcp_common.h 
b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kcp_common.h
new file mode 100644
index 000..b3a6ee3
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kcp_common.h
@@ -0,0 +1,86 @@
+/*-
+ *   This file is provided under a dual BSD/LGPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GNU LESSER GENERAL PUBLIC LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2.1 of the GNU Lesser General Public License
+ *   as published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this program;
+ *
+ *   Contact Information:
+ *   Intel Corporation
+ *
+ *
+ *  

[dpdk-dev] [PATCH 0/3] Use common Linux tools to control DPDK ports

2016-01-27 Thread Ferruh Yigit
This work is to make DPDK ports more visible and to enable using common
Linux tools to configure DPDK ports.

Patch is based on KNI but contains only control functionality of it,
also this patch does not include any Linux kernel network driver as
part of it.

Basically with the help of a kernel module (KCP), virtual Linux network
interfaces named as "dpdk$" are created per DPDK port, control messages
sent to these virtual interfaces are forwarded to DPDK, and response
sent back to Linux application.

Virtual interfaces are created when the DPDK application starts and are
destroyed automatically when the DPDK application terminates.

Communication between kernel-space and DPDK is done using a netlink socket.

Currently implementation is not complete, sample support added for the
RFC, more functionality can be added based on community response.

With this RFC Patch, supported: get/set mac address/mtu of DPDK devices,
getting stats from DPDK devices and some set of ethtool commands.

In the long term this patch intends to replace KNI, and KNI will be
deprecated.

Samples:

$ ifconfig
dpdk0: flags=4099  mtu 1500
ether 90:e2:ba:0e:49:b8  txqueuelen 1000  (Ethernet)
RX packets 33  bytes 2058 (2.0 KiB)
RX errors 0  dropped 0  overruns 0  frame 0
TX packets 33  bytes 2058 (2.0 KiB)
TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

dpdk1: flags=4099  mtu 1500
ether 00:1b:21:76:fa:21  txqueuelen 1000  (Ethernet)
RX packets 0  bytes 0 (0.0 B)
RX errors 0  dropped 0  overruns 0  frame 0
TX packets 0  bytes 0 (0.0 B)
TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

After some traffic on port 0:

$ ifconfig
dpdk0: flags=4099  mtu 1500
ether 90:e2:ba:0e:49:77  txqueuelen 1000  (Ethernet)
RX packets 962  bytes 57798 (56.4 KiB)
RX errors 0  dropped 0  overruns 0  frame 0
TX packets 962  bytes 57798 (56.4 KiB)
TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0


$ ethtool -i dpdk0
driver: rte_ixgbe_pmd
version: RTE 2.3.0-rc0
firmware-version: 
expansion-rom-version: 
bus-info: :08:00.0
supports-statistics: yes
supports-test: no
supports-eeprom-access: yes
supports-register-dump: yes
supports-priv-flags: no


$ ip l show dpdk0
25: dpdk0:  mtu 1500 qdisc noop state DOWN 
mode DEFAULT group default qlen 1000
link/ether 90:e2:ba:0e:49:b8 brd ff:ff:ff:ff:ff:ff

$ ip l set dpdk0 addr 90:e2:ba:0e:49:77

$ ip l show dpdk0
25: dpdk0:  mtu 1500 qdisc noop state DOWN 
mode DEFAULT group default qlen 1000
link/ether 90:e2:ba:0e:49:77 brd ff:ff:ff:ff:ff:ff

Ferruh Yigit (3):
  kcp: add kernel control path kernel module
  rte_ctrl_if: add control interface library
  examples/ethtool: add control interface support to the application

 config/common_linuxapp |   9 +-
 doc/api/doxy-api-index.md  |   3 +-
 doc/api/doxy-api.conf  |   1 +
 doc/guides/rel_notes/release_2_3.rst   |   9 +
 doc/guides/sample_app_ug/ethtool.rst   |  41 +++
 examples/ethtool/ethtool-app/main.c|  10 +-
 lib/Makefile   |   3 +-
 lib/librte_ctrl_if/Makefile|  58 
 lib/librte_ctrl_if/rte_ctrl_if.c   | 162 ++
 lib/librte_ctrl_if/rte_ctrl_if.h   | 115 +++
 lib/librte_ctrl_if/rte_ctrl_if_version.map |   9 +
 lib/librte_ctrl_if/rte_ethtool.c   | 354 +
 lib/librte_ctrl_if/rte_ethtool.h   |  54 
 lib/librte_ctrl_if/rte_nl.c| 259 +++
 lib/librte_ctrl_if/rte_nl.h|  60 
 lib/librte_eal/common/include/rte_log.h|   3 +-
 lib/librte_eal/linuxapp/Makefile   |   5 +-
 lib/librte_eal/linuxapp/eal/Makefile   |   3 +-
 .../linuxapp/eal/include/exec-env/rte_kcp_common.h |  86 +
 lib/librte_eal/linuxapp/kcp/Makefile   |  58 
 lib/librte_eal/linuxapp/kcp/kcp_dev.h  |  65 
 lib/librte_eal/linuxapp/kcp/kcp_ethtool.c  | 261 +++
 lib/librte_eal/linuxapp/kcp/kcp_misc.c | 282 
 lib/librte_eal/linuxapp/kcp/kcp_net.c  | 209 
 lib/librte_eal/linuxapp/kcp/kcp_nl.c   | 194 +++
 mk/rte.app.mk  |   3 +-
 26 files changed, 2307 insertions(+), 9 deletions(-)
 create mode 100644 lib/librte_ctrl_if/Makefile
 create mode 100644 lib/librte_ctrl_if/rte_ctrl_if.c
 create mode 100644 lib/librte_ctrl_if/rte_ctrl_if.h
 create mode 100644 lib/librte_ctrl_if/rte_ctrl_if_version.map
 create mode 100644 lib/librte_ctrl_if/rte_ethtool.c
 create mode 100644 lib/librte_ctrl_if/rte_ethtool.h
 

[dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms

2016-01-27 Thread Thomas Monjalon
2016-01-17 22:05, Zhihong Wang:
> This patch set optimizes DPDK memcpy for AVX512 platforms, to make full
> utilization of hardware resources and deliver high performance.

On a related note, your expertise would be very valuable to review
these patches please:
(memcpy) http://dpdk.org/dev/patchwork/patch/4396/
(memcmp) http://dpdk.org/dev/patchwork/patch/4788/

Thanks


[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMUi driver mode

2016-01-27 Thread Santosh Shukla
On Tue, Jan 26, 2016 at 9:51 PM, Santosh Shukla  wrote:
> On Tue, Jan 26, 2016 at 7:58 PM, Thomas Monjalon
>  wrote:
>> 2016-01-26 19:35, Santosh Shukla:
>>> On Tue, Jan 26, 2016 at 6:30 PM, Thomas Monjalon
>>>  wrote:
>>> > 2016-01-26 15:56, Santosh Shukla:
>>> >> In my observation, currently virtio work for vfio-noiommu, that's why
>>> >> said drv->kdrv need to know vfio mode.
>>> >
>>> > It is your observation. It may change in near future.
>>>
>>> so that mean till then, virtio support for non-x86 arch has to wait?
>>
>> No, absolutely not. virtio for non-x86 is welcome.
>>
>>> We have working model with vfio-noiommu, don't you think it make sense
>>> to let vfio_noiommu implementation exist and later in-case
>>> virtio+iommu gets mainline then switch to vfio __mode__ agnostic
>>> approach. And for that All it takes to replace __noiommu suffix with
>>> default.
>>
>> I'm just saying you should not touch the enum rte_kernel_driver.
>> RTE_KDRV_VFIO is a driver.
>> RTE_KDRV_VFIO_NOIOMMU is a mode.
>> As the VFIO API is the same in both modes, there is no reason to
>> distinguish them at this level.
>> Your patch adds the NOIOMMU case everywhere:
>> case RTE_KDRV_VFIO:
>> +   case RTE_KDRV_VFIO_NOIOMMU:
>>
>> I'll stop commenting here to let others give their opinion.
>>
>> [...]
>>> >> with vfio+iommu; binding virtio pci device to vfio-pci driver fail;
>>> >> giving below error:
>>> >> [   53.053464] VFIO - User Level meta-driver version: 0.3
>>> >> [   73.077805] vfio-pci: probe of :00:03.0 failed with error -22
>>> >> [   73.077852] vfio-pci: probe of :00:03.0 failed with error -22
>>> >>
>>> >> vfio_pci_probe() --> vfio_iommu_group_get() --> iommu_group_get()
>>> >> fails: iommu doesn't have group for virtio pci device.
>>> >
>>> > Yes it fails when binding.
>>> > So the later check in the virtio PMD is useless.
>>>
>>> Which check?
>>
>> The check for VFIO noiommu only:
>> -   if (dev->kdrv == RTE_KDRV_VFIO)
>> +   if (dev->kdrv == RTE_KDRV_VFIO_NOIOMMU)
>>
>> [...]
>>> > Furthermore restricting virtio to no-iommu mode doesn't bring
>>> > any improvement.
>>>
>>> We're not __restricting__, as soon as virtio+iommu gets working state,
>>> we'll simply replace __noiommu with default. Then its upto user to try
>>> out virtio with vfio default or vfio_noiommu.
>>
>> Yes it's up to user.
>> So your code should be
>> if (dev->kdrv == RTE_KDRV_VFIO)
>>
>
> Right,
>
>>> > That's why I suggest to keep the initial semantic of kdrv and
>>> > not pollute it with VFIO modes.
>>>
>>> I am okay to live with default and forget suffix __noiommu but there
>>> are implementation problem which was discussed in other thread
>>> - Virtio pmd driver should avoid interface parsing i.e.
>>> virtio_resource_init_uio/vfio() etc.. For vfio case - We could easily
>>> get rid of by moving /sys parsing to pci_eal layer, Right? If so then
>>> virtio currently works with vfio-noiommu, it make sense to me that
>>> pci_eal layer does parsing for pmd driver before that pmd driver get
>>> initialized.
>>
>> Please reword. What is the problem?
>>
>>> - Another case could be: iommu-less-pmd-driver. eal layer to do
>>> parsing before updating drv->kdrv.
>>
>> [...]
>>> >> >> > If a check is needed, I would prefer using your function
>>> >> >> > pci_vfio_is_noiommu() and remove driver modes from struct 
>>> >> >> > rte_kernel_driver.
>>> >> >>
>>> >> >> I don't think calling pci_vfio_no_iommu() inside
>>> >> >> virtio_reg_rd/wr_1/2/3() would be a good idea.
>>> >> >
>>> >> > Why? The value may be cached in the priv properties.
>>> >> >
>>> >> pci_vfio_is_noiommu() parses /sys for
>>> >> - enable_noiommu param
>>> >> - attached driver name is vfio-noiommu or not.
>>> >>
>>> >> It does file operation for that, I meant to say that calling this api
>>> >> within register_rd/wr function is not correct. It would be better if
>>> >> those low level register_rd/wr api only checks driver_types.
>>> >
>>> > Yes, that's why I said the return of pci_vfio_is_noiommu() may be cached
>>> > to keep efficiency.
>>>
>>> I am not convinced though, Still find pmd driver checking driver_types
>>> using drv->kdrv is better approach than introducing a new global
>>> variable which may look something like;
>>
>> Not a global variable. A function in EAL layer. A variable in PMD priv.
>>
>
> If we agreed to use condition (drv->kdrv == RTE_KDRV_VFIO);
> then resource parsing for vfio {including vfio and vfio_noiommu both
> case} is enforced in virtio pmd driver layer and that is contradicting
> to what we agreed earlier in this[1] thread. Also we don't need a
> function in EAL layer or a variable in PMD priv. Perhaps a private
> function in virtio pmd which does parsing for vfio interface.
>
> Thoughts?
>
> [1] http://dpdk.org/dev/patchwork/patch/9862/
>

Any comment/feedback on above approach?

>>> At pci_eal layer 
>>> bool vfio_mode;
>>> vfio_mode = pci_vfio_is_noiommu();
>>>
>>> At virtio pmd driver layer 
>>> Checking 

[dpdk-dev] [PATCH v2 0/2] minor cleanup in ethdev hotplug

2016-01-27 Thread Thomas Monjalon
2016-01-22 15:06, David Marchand:
> It was first a preparation step for future patchsets, but I am not sure what
> will become of them, so sending this anyway since it does not hurt to clean
> this now.
> 
> Changes since v1:
> - rebased on HEAD (previous patchset was based on another patch I sent
>   separately)
> - restored EINVAL error code for rte_eth_dev_(at|de)tach (thanks Jan)

Applied, thanks


[dpdk-dev] [PATCH v5 10/11] virtio: pci: add dummy func definition for in/outb for non-x86 arch

2016-01-27 Thread Santosh Shukla
Ping?

On Tue, Jan 19, 2016 at 5:16 PM, Santosh Shukla  wrote:
> For non-x86 arch, Compiler will throw build error for in/out apis. Including
> dummy api function so to pass build.
>
> Note that: For virtio to work for non-x86 arch - RTE_EAL_VFIO is the only
> supported method. RTE_EAL_IGB_UIO is not supported for non-x86 arch.
>
> So, Virtio support for arch and supported interface by that arch:
>
> ARCH   IGB_UIO  VFIO
> x86 Y   Y
> ARM64   N/A Y
> PPC_64  N/A Y   (Not tested but likely should work, as vfio is
> arch independent)
>
> Note: Applicable for virtio spec 0.95
>
> Signed-off-by: Santosh Shukla 
> ---
>  drivers/net/virtio/virtio_pci.h |   46 
> +++
>  1 file changed, 46 insertions(+)
>
> diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
> index f550d22..b88f9ec 100644
> --- a/drivers/net/virtio/virtio_pci.h
> +++ b/drivers/net/virtio/virtio_pci.h
> @@ -46,6 +46,7 @@
>  #endif
>
>  #include 
> +#include "virtio_logs.h"
>
>  struct virtqueue;
>
> @@ -320,6 +321,51 @@ outl_p(unsigned int data, unsigned int port)
>  }
>  #endif
>
> +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_I686) && \
> +   defined(RTE_EXEC_ENV_LINUXAPP)
> +static inline uint8_t
> +inb(unsigned long addr __rte_unused)
> +{
> +   PMD_INIT_LOG(ERR, "inb() not supported for this RTE_ARCH\n");
> +   return 0;
> +}
> +
> +static inline uint16_t
> +inw(unsigned long addr __rte_unused)
> +{
> +   PMD_INIT_LOG(ERR, "inw() not supported for this RTE_ARCH\n");
> +   return 0;
> +}
> +
> +static inline uint32_t
> +inl(unsigned long addr __rte_unused)
> +{
> +   PMD_INIT_LOG(ERR, "in() not supported for this RTE_ARCH\n");
> +   return 0;
> +}
> +
> +static inline void
> +outb_p(unsigned char data __rte_unused, unsigned int port __rte_unused)
> +{
> +   PMD_INIT_LOG(ERR, "outb_p() not supported for this RTE_ARCH\n");
> +   return;
> +}
> +
> +static inline void
> +outw_p(unsigned short data __rte_unused, unsigned int port __rte_unused)
> +{
> +   PMD_INIT_LOG(ERR, "outw_p() not supported for this RTE_ARCH\n");
> +   return;
> +}
> +
> +static inline void
> +outl_p(unsigned int data __rte_unused, unsigned int port __rte_unused)
> +{
> +   PMD_INIT_LOG(ERR, "outl_p() not supported for this RTE_ARCH\n");
> +   return;
> +}
> +#endif
> +
>  static inline int
>  vtpci_with_feature(struct virtio_hw *hw, uint64_t bit)
>  {
> --
> 1.7.9.5
>


[dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode

2016-01-27 Thread Burakov, Anatoly
Hi Thomas,

> > +/* DMA mapping function prototype.
> > + * Takes VFIO container fd as a parameter.
> > + * Returns 0 on success, -1 on error.
> > + * */
> > +typedef  int (*vfio_dma_func_t)(int);
> > +
> > +struct vfio_iommu_type {
> > +   int type_id;
> > +   const char *name;
> > +   vfio_dma_func_t dma_map_func;
> > +};
> > +
> > +int vfio_iommu_type1_dma_map(int);
> > +int vfio_iommu_noiommu_dma_map(int);
> 
> Is it possible (is it better) to declare these functions with vfio_dma_func_t?

Yeah, sure. Or maybe the other way around - maybe we could do away with the 
typedef. I'll go for the former though.

> vfio_iommu_noiommu_dma_map is a weird name.
> Why not vfio_noiommu_dma_map or vfio_iommu_none_dma_map?

Well, the NOIOMMU type is named VFIO_IOMMU_NOIOMMU in the VFIO headers. So it's 
consistent with the IOMMU type name. Although vfio_noiommu_dma_map seems 
reasonable.

Thanks,
Anatoly


[dpdk-dev] [PATCH V1 1/1] jobstats: added function abort for job

2016-01-27 Thread Jastrzebski, MichalX K
> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Panu Matilainen
> Sent: Wednesday, January 27, 2016 2:38 PM
> To: Kerlin, MarcinX ; dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH V1 1/1] jobstats: added function abort for job
> 
> On 01/26/2016 06:15 PM, Marcin Kerlin wrote:
> > This patch adds a new function, rte_jobstats_abort. It marks *job* as finished
> > and the time of this work will be added to management time instead of execution
> time.
> > This function should be used instead of rte_jobstats_finish if a condition
> occurs;
> > the condition is defined by the application, for example when receiving n>0
> packets.
> >
> > Signed-off-by: Marcin Kerlin 
> > ---
> >   lib/librte_jobstats/rte_jobstats.c   | 22 ++
> >   lib/librte_jobstats/rte_jobstats.h   | 17 +
> >   lib/librte_jobstats/rte_jobstats_version.map |  7 +++
> >   3 files changed, 46 insertions(+)
> >
> [...]
> > diff --git a/lib/librte_jobstats/rte_jobstats.h
> b/lib/librte_jobstats/rte_jobstats.h
> > index de6a89a..9995319 100644
> > --- a/lib/librte_jobstats/rte_jobstats.h
> > +++ b/lib/librte_jobstats/rte_jobstats.h
> > @@ -90,6 +90,9 @@ struct rte_jobstats {
> > uint64_t exec_cnt;
> > /**< Execute count. */
> >
> > +   uint64_t last_job_time;
> > +   /**< Last job time */
> > +
> > char name[RTE_JOBSTATS_NAMESIZE];
> > /**< Name of this job */
> >
> 
> AFAICS this is an ABI break and as such, needs to be preannounced, see
> http://dpdk.org/doc/guides/contributing/versioning.html
> For 2.3 it'd need to be a CONFIG_RTE_NEXT_ABI feature.
> 
>   - Panu -

Hi Panu,
Thanks for Your notice. This last_job_time field is actually not necessary here
and will be removed from this structure.

Best regards
Michal 


[dpdk-dev] [PATCH v6 1/2] mbuf: provide rte_pktmbuf_alloc_bulk API

2016-01-27 Thread Panu Matilainen
On 01/26/2016 07:03 PM, Huawei Xie wrote:
> v6 changes:
>   reflect the changes in release notes and library version map file
>   revise our duff's code style a bit to make it more readable
>
> v5 changes:
>   add comment about duff's device and our variant implementation
>
> v3 changes:
>   move while after case 0
>   add context about duff's device and why we use while loop in the commit
> message
>
> v2 changes:
>   unroll the loop a bit to help the performance
>
> rte_pktmbuf_alloc_bulk allocates a bulk of packet mbufs.
>
> There is related thread about this bulk API.
> http://dpdk.org/dev/patchwork/patch/4718/
> Thanks to Konstantin's loop unrolling.
>
> Attached the wiki page about duff's device. It explains the performance
> optimization through loop unwinding, and also the most dramatic use of
> case label fall-through.
> https://en.wikipedia.org/wiki/Duff%27s_device
>
> In our implementation, we use while() loop rather than do{} while() loop
> because we could not assume count is strictly positive. Using while()
> loop saves one line of check if count is zero.
>
> Signed-off-by: Gerald Rogers 
> Signed-off-by: Huawei Xie 
> Acked-by: Konstantin Ananyev 
> ---
>   doc/guides/rel_notes/release_2_3.rst |  3 ++
>   lib/librte_mbuf/rte_mbuf.h   | 55 
> 
>   lib/librte_mbuf/rte_mbuf_version.map |  7 +
>   3 files changed, 65 insertions(+)
>
> diff --git a/doc/guides/rel_notes/release_2_3.rst 
> b/doc/guides/rel_notes/release_2_3.rst
> index 99de186..a52cba3 100644
> --- a/doc/guides/rel_notes/release_2_3.rst
> +++ b/doc/guides/rel_notes/release_2_3.rst
> @@ -4,6 +4,9 @@ DPDK Release 2.3
>   New Features
>   
>
> +* **Enable bulk allocation of mbufs. **
> +  A new function ``rte_pktmbuf_alloc_bulk()`` has been added to allow the 
> user
> +  to allocate a bulk of mbufs.
>
>   Resolved Issues
>   ---
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index f234ac9..b2ed479 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -1336,6 +1336,61 @@ static inline struct rte_mbuf 
> *rte_pktmbuf_alloc(struct rte_mempool *mp)
>   }
>
>   /**
> + * Allocate a bulk of mbufs, initialize refcnt and reset the fields to 
> default
> + * values.
> + *
> + *  @param pool
> + *The mempool from which mbufs are allocated.
> + *  @param mbufs
> + *Array of pointers to mbufs
> + *  @param count
> + *Array size
> + *  @return
> + *   - 0: Success
> + */
> +static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
> +  struct rte_mbuf **mbufs, unsigned count)
> +{
> + unsigned idx = 0;
> + int rc;
> +
> + rc = rte_mempool_get_bulk(pool, (void **)mbufs, count);
> + if (unlikely(rc))
> + return rc;
> +
> + /* To understand duff's device on loop unwinding optimization, see
> +  * https://en.wikipedia.org/wiki/Duff's_device.
> +  * Here while() loop is used rather than do() while{} to avoid extra
> +  * check if count is zero.
> +  */
> + switch (count % 4) {
> + case 0:
> + while (idx != count) {
> + RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0);
> + rte_mbuf_refcnt_set(mbufs[idx], 1);
> + rte_pktmbuf_reset(mbufs[idx]);
> + idx++;
> + case 3:
> + RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0);
> + rte_mbuf_refcnt_set(mbufs[idx], 1);
> + rte_pktmbuf_reset(mbufs[idx]);
> + idx++;
> + case 2:
> + RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0);
> + rte_mbuf_refcnt_set(mbufs[idx], 1);
> + rte_pktmbuf_reset(mbufs[idx]);
> + idx++;
> + case 1:
> + RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0);
> + rte_mbuf_refcnt_set(mbufs[idx], 1);
> + rte_pktmbuf_reset(mbufs[idx]);
> + idx++;
> + }
> + }
> + return 0;
> +}
> +
> +/**
>* Attach packet mbuf to another packet mbuf.
>*
>* After attachment we refer the mbuf we attached as 'indirect',
> diff --git a/lib/librte_mbuf/rte_mbuf_version.map 
> b/lib/librte_mbuf/rte_mbuf_version.map
> index e10f6bd..257c65a 100644
> --- a/lib/librte_mbuf/rte_mbuf_version.map
> +++ b/lib/librte_mbuf/rte_mbuf_version.map
> @@ -18,3 +18,10 @@ DPDK_2.1 {
>   rte_pktmbuf_pool_create;
>
>   } DPDK_2.0;
> +
> +DPDK_2.3 {
> + global:
> +
> + rte_pktmbuf_alloc_bulk;
> +
> +} DPDK_2.1;
>

Since rte_pktmbuf_alloc_bulk() is an inline function, it is not part of 
the library ABI and should not be listed in the version map.

I assume it's inline for performance reasons, but then you lose the 
benefits of dynamic linking such as the ability to fix bugs and/or improve 
it by just updating the library. Since the point of 

[dpdk-dev] Errors Rx count increasing while pktgen doing nothing on Intel 82598EB 10G

2016-01-27 Thread Moon-Sang Lee
Laurent, have you resolved this problem?
I'm using the same NIC as yours (i.e. Intel 82598EB 10G NIC) and faced the
same problem as you.
Here is parts of my log and it says that PMD cannot enable RX queue for my
NIC.
I'm using DPDK 2.2.0 and used 'null' for the 4th parameter in calling
rte_eth_rx_queue_setup().
(i.e. 'null' parameter provides the default rx_conf value.)

Thanks.



APP: initialising port 0 ...
PMD: ixgbe_dev_rx_queue_setup(): sw_ring=0x7f5f27258040
sw_sc_ring=0x7f5f27257b00 hw_ring=0x7f5f27258580 dma_addr=0x41f458580
PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f5f27245940
hw_ring=0x7f5f27247980 dma_addr=0x41f447980
PMD: ixgbe_set_tx_function(): Using simple tx code path
PMD: ixgbe_set_tx_function(): Vector tx enabled.
PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f5f272337c0
hw_ring=0x7f5f27235800 dma_addr=0x41f435800
PMD: ixgbe_set_tx_function(): Using simple tx code path
PMD: ixgbe_set_tx_function(): Vector tx enabled.
PMD: ixgbe_set_rx_function(): Vector rx enabled, please make sure RX burst
size no less than 4 (port=0).
*PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 0*
APP: port 0 has started
APP: port 0 has entered in promiscuous mode
APP: port 0 initialization is done.
KNI: pci: 09:00:00 8086:10c7
APP: kni allocation is done for port 0.
APP: initialising port 1 ...
PMD: ixgbe_dev_rx_queue_setup(): sw_ring=0x7f5f27222dc0
sw_sc_ring=0x7f5f27222880 hw_ring=0x7f5f27223300 dma_addr=0x41f423300
PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f5f272106c0
hw_ring=0x7f5f27212700 dma_addr=0x41f412700
PMD: ixgbe_set_tx_function(): Using simple tx code path
PMD: ixgbe_set_tx_function(): Vector tx enabled.
PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f5f271fe540
hw_ring=0x7f5f27200580 dma_addr=0x41f400580
PMD: ixgbe_set_tx_function(): Using simple tx code path
PMD: ixgbe_set_tx_function(): Vector tx enabled.
PMD: ixgbe_set_rx_function(): Vector rx enabled, please make sure RX burst
size no less than 4 (port=1).
*PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 0*
APP: port 1 has started
APP: port 1 has entered in promiscuous mode
APP: port 1 initialization is done.
KNI: pci: 0a:00:00 8086:10c7
APP: kni allocation is done for port 1.

checking link status
.done
Port 0 Link Up - speed 1 Mbps - full-duplex
Port 1 Link Up - speed 1 Mbps - full-duplex


On Mon, Dec 28, 2015 at 5:28 AM, Wiles, Keith  wrote:

> On 12/27/15, 2:09 PM, "Laurent GUERBY"  wrote:
>
> >On Sun, 2015-12-27 at 19:43 +, Wiles, Keith wrote:
> >> On 12/27/15, 12:31 PM, "dev on behalf of Laurent GUERBY" <
> dev-bounces at dpdk.org on behalf of laurent at guerby.net> wrote:
> >>
> >> >Hi,
> >> >
> >> >I reported today an issue when using Pktgen-DPDK:
> >> >https://github.com/pktgen/Pktgen-DPDK/issues/52
> >> >
> >> >But I think it's more in DPDK than pktgen
> >> >
> >> >two identical machines with SFP+ DA cable between them
> >> >DPDK 2.2.0 from tarball
> >> >Pktgen-DPDK from git
> >> >two identical machines:
> >> >core i7 2600 (sandy bridge 4C/8T), HT disabled in the BIOS
> >> >ASUS P8H67-M PRO BIOS 3904 (latest available)
> >> >Ethernet controller: Intel Corporation 82598EB 10-Gigabit AF Dual Port
> >> >Network Connection (rev 01)
> >> >01:00.0 0200: 8086:10f1 (rev 01)
> >> >Subsystem: 8086:a21f
> >> >boot kernel 3.16 unbutu 14.04 with isolcpus=2,3,4
> >> >
> >> >When launching pktgen even with no TX asked the Errors RX counters
> keeps
> >> >going up by about 7.4 millions per second:
> >> >
> >> >Errors Rx/Tx : 7471857054/0
> >> >
> >> >In the log I get "Could not enable Rx Queue", might be the
> >> >source of the issue?
> >> >
> >> >PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 0
> >> >PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 1
> >> >
> >> >When sending traffic  single UDP src/dst/IP/MAC the setup
> >> >reaches 14204188 pps 64 bytes, the error counter is also
> >> >increasing.
> >> >
> >> >Any idea what to look for?
> >>
> >> One more suggestion is to run test_pmd on one machine and something
> >> like iperf on the other to verify the DPDK is working correct, which I
> >> assume will be true. Not sure the RX errors are reported in the
> >> test_pmd or you could use the l3fwd application too.
> >
> >Ok, I will check the test_pmd documentation and try to do this test: I'm
> >just starting on DPDK :).
> >
> >> Please also send me the 'lspci | grep Ethernet' output.
> >
> >I included one line in my original email above (plus extract of lspci
> >-vn), here is the full output of the command:
> >
> >01:00.0 Ethernet controller: Intel Corporation 82598EB 10-Gigabit AF
> >Dual Port Network Connection (rev 01)
> >01:00.1 Ethernet controller: Intel Corporation 82598EB 10-Gigabit AF
> >Dual Port Network Connection (rev 01)
> >05:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd.
> >RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller (rev 06)
> >
> >(The realtek is used only for internet connectivity).
> >
> >> Also send me the command line.
> >
> >On the first 

[dpdk-dev] [PATCH V1 1/1] jobstats: added function abort for job

2016-01-27 Thread Panu Matilainen
On 01/26/2016 06:15 PM, Marcin Kerlin wrote:
> This patch adds a new function, rte_jobstats_abort. It marks *job* as finished,
> and the time of this work will be added to management time instead of execution
> time.
> This function should be used instead of rte_jobstats_finish if a condition
> occurs;
> the condition is defined by the application, for example when receiving n>0
> packets.
>
> Signed-off-by: Marcin Kerlin 
> ---
>   lib/librte_jobstats/rte_jobstats.c   | 22 ++
>   lib/librte_jobstats/rte_jobstats.h   | 17 +
>   lib/librte_jobstats/rte_jobstats_version.map |  7 +++
>   3 files changed, 46 insertions(+)
>
[...]
> diff --git a/lib/librte_jobstats/rte_jobstats.h 
> b/lib/librte_jobstats/rte_jobstats.h
> index de6a89a..9995319 100644
> --- a/lib/librte_jobstats/rte_jobstats.h
> +++ b/lib/librte_jobstats/rte_jobstats.h
> @@ -90,6 +90,9 @@ struct rte_jobstats {
>   uint64_t exec_cnt;
>   /**< Execute count. */
>
> + uint64_t last_job_time;
> + /**< Last job time */
> +
>   char name[RTE_JOBSTATS_NAMESIZE];
>   /**< Name of this job */
>

AFAICS this is an ABI break and as such, needs to be preannounced, see 
http://dpdk.org/doc/guides/contributing/versioning.html
For 2.3 it'd need to be a CONFIG_RTE_NEXT_ABI feature.

- Panu -



[dpdk-dev] [PATCH] no need to test for NULL when freeing

2016-01-27 Thread Thomas Monjalon
2016-01-21 12:23, David Marchand:
> free() already handles NULL pointer.
> 
> Signed-off-by: David Marchand 

Applied, thanks


[dpdk-dev] [PATCH] rte.extvars.mk: allow overriding RTE_SDK_BIN from the environment

2016-01-27 Thread Thomas Monjalon
2016-01-20 21:15, Matthew Hall:
> On 1/20/16 7:27 AM, Thomas Monjalon wrote:
> > Hi Matthew,
> >
> > RTE_SDK_BIN is an internal variable and should not be overridden.
>  >
> > Have you installed DPDK somewhere? Example:
> > make install O=mybuild DESTDIR=mylocalinstall
> >
> > Then you should build your app like this:
> > make RTE_SDK=$(readlink -e ../dpdk/mylocalinstall/usr/local/share/dpdk)
> 
> Hello Thomas,
> 
> Is the way the make install target really works documented somewhere?

It is poorly described here:
http://dpdk.org/doc/guides/prog_guide/dev_kit_root_make_help.html#install-targets

> This target did not exist when I first used DPDK in 2011, and since then 
> I saw various documentation on building DPDK in various places, but not 
> that much explanation what make install actually does. I recall various 
> list threads about changing its behavior as well.

Historically, "make install" was a convenient default build (with T= option).
The DESTDIR option was added to make a real install after building.
The standard form (without T=) is now implemented to do a real install.

> For example, if I look at this apparently most official document:
> 
> http://dpdk.org/doc/guides/linux_gsg/build_dpdk.html
> 
> It has build examples such as:
> 
> make install T=x86_64-native-linuxapp-gcc

This command finishes with this message:
Installation cannot run with T defined and DESTDIR undefined

Yes you are right, some docs are neither complete nor up-to-date.
Volunteers are welcome.

> But it does not discuss "O=" or "DESTDIR=" or any other additional 
> options. From some experiments on my machine, it looks like maybe I 
> could do this:
> 
> make install "T=${RTE_TARGET}" "O=build" "DESTDIR=build"
> 
> Is that a valid possibility, to keep it all in one easy directory?

Yes you can install where you want.
Note that this command (with T= and O=) will build in the directory $O/$T
i.e. build/${RTE_TARGET} and install in build/

Please confirm that this patch is not needed. Thanks


[dpdk-dev] [PATCH 4/5] vhost: do not use rte_memcpy for virtio_hdr copy

2016-01-27 Thread Yuanhan Liu
On Wed, Jan 27, 2016 at 06:16:37AM +, Xie, Huawei wrote:
> On 1/27/2016 2:02 PM, Yuanhan Liu wrote:
> > On Wed, Jan 27, 2016 at 05:56:56AM +, Xie, Huawei wrote:
> >> On 1/27/2016 11:22 AM, Yuanhan Liu wrote:
> >>> On Wed, Jan 27, 2016 at 02:46:39AM +, Xie, Huawei wrote:
>  On 12/3/2015 2:03 PM, Yuanhan Liu wrote:
> > +   if (vq->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
> > +   *(struct virtio_net_hdr_mrg_rxbuf 
> > *)(uintptr_t)desc_addr = hdr;
> > +   } else {
> > +   *(struct virtio_net_hdr *)(uintptr_t)desc_addr = 
> > hdr.hdr;
> > +   }
>  Thanks!
>  We might simplify this further. Just reset the first two fields flags
>  and gso_type.
> >>> What's this "simplification" for? Not to mention that we will add
> >>> TSO support, which modifies a few more fields, such as csum_start: resetting
> >>> the first two fields only is wrong here.
> >> I know TSO before commenting, but at least in this implementation and
> >> this specific patch, i guess zeroing two fields are enough.
> >>
> >> What is wrong resetting only two fields?
> > I then have to ask "What is the benefit of resetting only two fields"?
> > If doing so, we have to change it back for TSO. My proposal requires no
> > extra change when adding TSO support.
> 
> ? Benefit is we save four unnecessary stores.

Hmm..., the hdr size is 12 bytes at most. I mean, does it really matter,
copying 3 bytes, or copying 12 bytes in a row?

--yliu


[dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode

2016-01-27 Thread Anatoly Burakov
This commit is adding a generic mechanism to support multiple IOMMU
types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special
VFIO mode that doesn't use IOMMU at all), but it's easily extended
by adding necessary definitions into eal_pci_init.h and a DMA
mapping function to eal_pci_vfio.c.

Since type 1 IOMMU module is no longer necessary to have VFIO,
we fix the module check to check for vfio-pci instead. It's not
ideal and triggers VFIO checks more often (and thus produces more
error output, which was the reason behind the module check in the
first place), so we compensate for that by providing more verbose
logging, indicating whether VFIO initialization has succeeded or
failed.

Signed-off-by: Anatoly Burakov 
Signed-off-by: Santosh Shukla 
Tested-by: Santosh Shukla 
---
v4 changes:
  Fixed the commit message and added a missing sign-off

v3 changes:
  Merging DMA mapping functions back into eal_pci_vfio.c
  Fixing and adding comments

v2 changes:
  Compile fix (hat-tip to Santosh Shukla)
  Tested-by is provisional, since only superficial testing was done

 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 +
 lib/librte_eal/linuxapp/eal/eal_vfio.h |   5 +
 2 files changed, 157 insertions(+), 53 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c 
b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 74f91ba..fdf334b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
 #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)

 /* per-process VFIO config */
 static struct vfio_config vfio_cfg;

+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ * */
+typedef  int (*vfio_dma_func_t)(int);
+
+struct vfio_iommu_type {
+   int type_id;
+   const char *name;
+   vfio_dma_func_t dma_map_func;
+};
+
+int vfio_iommu_type1_dma_map(int);
+int vfio_iommu_noiommu_dma_map(int);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+   /* x86 IOMMU, otherwise known as type 1 */
+   { VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map},
+   /* IOMMU-less mode */
+   { VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map},
+};
+
+int
+vfio_iommu_type1_dma_map(int vfio_container_fd)
+{
+   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+   int i, ret;
+
+   /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+   struct vfio_iommu_type1_dma_map dma_map;
+
+   if (ms[i].addr == NULL)
+   break;
+
+   memset(&dma_map, 0, sizeof(dma_map));
+   dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+   dma_map.vaddr = ms[i].addr_64;
+   dma_map.size = ms[i].len;
+   dma_map.iova = ms[i].phys_addr;
+   dma_map.flags = VFIO_DMA_MAP_FLAG_READ | 
VFIO_DMA_MAP_FLAG_WRITE;
+
+   ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+   if (ret) {
+   RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+   "error %i (%s)\n", errno, 
strerror(errno));
+   return -1;
+   }
+   }
+
+   return 0;
+}
+
+int
+vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+   /* No-IOMMU mode does not need DMA mapping */
+   return 0;
+}
+
 int
 pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
void *buf, size_t len, off_t offs)
@@ -208,42 +271,58 @@ pci_vfio_set_bus_master(int dev_fd)
return 0;
 }

-/* set up DMA mappings */
-static int
-pci_vfio_setup_dma_maps(int vfio_container_fd)
-{
-   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-   int i, ret;
-
-   ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
-   VFIO_TYPE1_IOMMU);
-   if (ret) {
-   RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
-   "error %i (%s)\n", errno, strerror(errno));
-   return -1;
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd) {
+   unsigned idx;
+   for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+   const struct vfio_iommu_type *t = &iommu_types[idx];
+
+   int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+   t->type_id);
+   if (!ret) {
+   RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+ 

[dpdk-dev] [PATCH v3] vfio: Support for no-IOMMU mode

2016-01-27 Thread Burakov, Anatoly
Apologies, lost the signoff from Santosh Shukla and also the commit message 
still mentions the file that is now non-existent, so I'll submit a v4.

Thanks,
Anatoly


> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Anatoly Burakov
> Sent: Wednesday, January 27, 2016 2:05 PM
> To: dev at dpdk.org
> Subject: [dpdk-dev] [PATCH v3] vfio: Support for no-IOMMU mode
> 
> This commit is adding a generic mechanism to support multiple IOMMU
> types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special VFIO
> mode that doesn't use IOMMU at all), but it's easily extended by adding
> necessary definitions into eal_pci_init.h and a DMA mapping function to
> eal_pci_vfio_dma.c.
> 
> Since type 1 IOMMU module is no longer necessary to have VFIO, we fix the
> module check to check for vfio-pci instead. It's not ideal and triggers VFIO
> checks more often (and thus produces more error output, which was the
> reason behind the module check in the first place), so we compensate for
> that by providing more verbose logging, indicating whether VFIO initialization
> has succeeded or failed.
> 
> Signed-off-by: Anatoly Burakov 
> Tested-by: Santosh Shukla 
> ---
> v3 changes:
>   Merging DMA mapping functions back into eal_pci_vfio.c
>   Fixing and adding comments
> 
> v2 changes:
>   Compile fix (hat-tip to Santosh Shukla)
>   Tested-by is provisional, since only superficial testing was done
> 
>  lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 +--
> --
>  lib/librte_eal/linuxapp/eal/eal_vfio.h |   5 +
>  2 files changed, 157 insertions(+), 53 deletions(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> index 74f91ba..fdf334b 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> @@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
>  #define VFIO_DIR "/dev/vfio"
>  #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
>  #define VFIO_GROUP_FMT "/dev/vfio/%u"
> +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
>  #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
> 
>  /* per-process VFIO config */
>  static struct vfio_config vfio_cfg;
> 
> +/* DMA mapping function prototype.
> + * Takes VFIO container fd as a parameter.
> + * Returns 0 on success, -1 on error.
> + * */
> +typedef  int (*vfio_dma_func_t)(int);
> +
> +struct vfio_iommu_type {
> + int type_id;
> + const char *name;
> + vfio_dma_func_t dma_map_func;
> +};
> +
> +int vfio_iommu_type1_dma_map(int);
> +int vfio_iommu_noiommu_dma_map(int);
> +
> +/* IOMMU types we support */
> +static const struct vfio_iommu_type iommu_types[] = {
> + /* x86 IOMMU, otherwise known as type 1 */
> + { VFIO_TYPE1_IOMMU, "Type 1",
> &vfio_iommu_type1_dma_map},
> + /* IOMMU-less mode */
> + { VFIO_NOIOMMU_IOMMU, "No-IOMMU",
> &vfio_iommu_noiommu_dma_map}, };
> +
> +int
> +vfio_iommu_type1_dma_map(int vfio_container_fd) {
> + const struct rte_memseg *ms = rte_eal_get_physmem_layout();
> + int i, ret;
> +
> + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> + for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> + struct vfio_iommu_type1_dma_map dma_map;
> +
> + if (ms[i].addr == NULL)
> + break;
> +
> + memset(&dma_map, 0, sizeof(dma_map));
> + dma_map.argsz = sizeof(struct
> vfio_iommu_type1_dma_map);
> + dma_map.vaddr = ms[i].addr_64;
> + dma_map.size = ms[i].len;
> + dma_map.iova = ms[i].phys_addr;
> + dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
> VFIO_DMA_MAP_FLAG_WRITE;
> +
> + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
> &dma_map);
> +
> + if (ret) {
> + RTE_LOG(ERR, EAL, "  cannot set up DMA remapping,
> "
> + "error %i (%s)\n", errno,
> strerror(errno));
> + return -1;
> + }
> + }
> +
> + return 0;
> +}
> +
> +int
> +vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd) {
> + /* No-IOMMU mode does not need DMA mapping */
> + return 0;
> +}
> +
>  int
>  pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
>   void *buf, size_t len, off_t offs) @@ -208,42 +271,58 @@
> pci_vfio_set_bus_master(int dev_fd)
>   return 0;
>  }
> 
> -/* set up DMA mappings */
> -static int
> -pci_vfio_setup_dma_maps(int vfio_container_fd) -{
> - const struct rte_memseg *ms = rte_eal_get_physmem_layout();
> - int i, ret;
> -
> - ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
> - VFIO_TYPE1_IOMMU);
> - if (ret) {
> - RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
> - "error %i (%s)\n", errno, strerror(errno));
> - return -1;
> +/* pick IOMMU type. returns a pointer to vfio_iommu_type 

[dpdk-dev] [PATCH 1/5] vhost: refactor rte_vhost_dequeue_burst

2016-01-27 Thread Yuanhan Liu
On Wed, Jan 27, 2016 at 06:12:22AM +, Xie, Huawei wrote:
> On 1/27/2016 11:26 AM, Yuanhan Liu wrote:
> > On Tue, Jan 26, 2016 at 10:30:12AM +, Xie, Huawei wrote:
> >> On 12/3/2015 2:03 PM, Yuanhan Liu wrote:
> >>> Signed-off-by: Yuanhan Liu 
> >>> ---
> >>>  lib/librte_vhost/vhost_rxtx.c | 287 
> >>> +-
> >>>  1 file changed, 113 insertions(+), 174 deletions(-)
> >> Prefer to unroll copy_mbuf_to_desc and your COPY macro. It prevents us
> > I'm okay to unroll COPY macro. But for copy_mbuf_to_desc, I prefer not
> > to do that, unless it has a good reason.
> >
> >> processing descriptors in a burst way in future.
> > So, do you have a plan?
> 
> I think it is OK. If we need unroll in future, we could do that then. I
> am open to this. Just my preference. I understand that wrapping makes
> code more readable.

Okay, let's consider it then: unroll would be easy after all.

--yliu


[dpdk-dev] [PATCH 0/9] pci cleanup and blacklist rework

2016-01-27 Thread David Marchand
On Fri, Jan 22, 2016 at 4:27 PM, David Marchand
 wrote:
> The 4th patch introduces a change in linux eal.
> Before, if a pci device was bound to no kernel driver, eal would set kdrv
> to "unknown". With this change, kdrv is set to "none".
> This might make it possible to avoid the old issue of virtio devices being
> used by dpdk while still bound to kernel driver reported by Franck B..
> I'll let virtio guys look at this.
> At the very least, it makes more sense to me.

Ok, actually, I had forgotten that Huawei had already sent a similar change [1].
So I suppose this patch commitlog is wrong, but the patch itself is
still worth for the cleanup.

Thomas, I suppose you will integrate Huawei patches first.
Then I will rebase and fix the commitlog.

[1] http://dpdk.org/dev/patchwork/patch/9718/


-- 
David Marchand


[dpdk-dev] [PATCH v3] vfio: Support for no-IOMMU mode

2016-01-27 Thread Anatoly Burakov
This commit is adding a generic mechanism to support multiple IOMMU
types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special
VFIO mode that doesn't use IOMMU at all), but it's easily extended
by adding necessary definitions into eal_pci_init.h and a DMA
mapping function to eal_pci_vfio_dma.c.

Since type 1 IOMMU module is no longer necessary to have VFIO,
we fix the module check to check for vfio-pci instead. It's not
ideal and triggers VFIO checks more often (and thus produces more
error output, which was the reason behind the module check in the
first place), so we compensate for that by providing more verbose
logging, indicating whether VFIO initialization has succeeded or
failed.

Signed-off-by: Anatoly Burakov 
Tested-by: Santosh Shukla 
---
v3 changes:
  Merging DMA mapping functions back into eal_pci_vfio.c
  Fixing and adding comments

v2 changes:
  Compile fix (hat-tip to Santosh Shukla)
  Tested-by is provisional, since only superficial testing was done

 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 +
 lib/librte_eal/linuxapp/eal/eal_vfio.h |   5 +
 2 files changed, 157 insertions(+), 53 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c 
b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 74f91ba..fdf334b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq)
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
 #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)

 /* per-process VFIO config */
 static struct vfio_config vfio_cfg;

+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ * */
+typedef  int (*vfio_dma_func_t)(int);
+
+struct vfio_iommu_type {
+   int type_id;
+   const char *name;
+   vfio_dma_func_t dma_map_func;
+};
+
+int vfio_iommu_type1_dma_map(int);
+int vfio_iommu_noiommu_dma_map(int);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+   /* x86 IOMMU, otherwise known as type 1 */
+   { VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map},
+   /* IOMMU-less mode */
+   { VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map},
+};
+
+int
+vfio_iommu_type1_dma_map(int vfio_container_fd)
+{
+   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+   int i, ret;
+
+   /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+   struct vfio_iommu_type1_dma_map dma_map;
+
+   if (ms[i].addr == NULL)
+   break;
+
+   memset(&dma_map, 0, sizeof(dma_map));
+   dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+   dma_map.vaddr = ms[i].addr_64;
+   dma_map.size = ms[i].len;
+   dma_map.iova = ms[i].phys_addr;
+   dma_map.flags = VFIO_DMA_MAP_FLAG_READ | 
VFIO_DMA_MAP_FLAG_WRITE;
+
+   ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+   if (ret) {
+   RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+   "error %i (%s)\n", errno, 
strerror(errno));
+   return -1;
+   }
+   }
+
+   return 0;
+}
+
+int
+vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+   /* No-IOMMU mode does not need DMA mapping */
+   return 0;
+}
+
 int
 pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
void *buf, size_t len, off_t offs)
@@ -208,42 +271,58 @@ pci_vfio_set_bus_master(int dev_fd)
return 0;
 }

-/* set up DMA mappings */
-static int
-pci_vfio_setup_dma_maps(int vfio_container_fd)
-{
-   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-   int i, ret;
-
-   ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
-   VFIO_TYPE1_IOMMU);
-   if (ret) {
-   RTE_LOG(ERR, EAL, "  cannot set IOMMU type, "
-   "error %i (%s)\n", errno, strerror(errno));
-   return -1;
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd) {
+   unsigned idx;
+   for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+   const struct vfio_iommu_type *t = &iommu_types[idx];
+
+   int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+   t->type_id);
+   if (!ret) {
+   RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+   t->type_id, t->name);
+   return t;
+ 

[dpdk-dev] [PATCH 4/5] vhost: do not use rte_memcpy for virtio_hdr copy

2016-01-27 Thread Yuanhan Liu
On Wed, Jan 27, 2016 at 05:56:56AM +, Xie, Huawei wrote:
> On 1/27/2016 11:22 AM, Yuanhan Liu wrote:
> > On Wed, Jan 27, 2016 at 02:46:39AM +, Xie, Huawei wrote:
> >> On 12/3/2015 2:03 PM, Yuanhan Liu wrote:
> >>> + if (vq->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
> >>> + *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
> >>> + } else {
> >>> + *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
> >>> + }
> >> Thanks!
> >> We might simplify this further. Just reset the first two fields flags
> >> and gso_type.
> > What's this "simplification" for? Not to mention that we will add
> > TSO support, which modifies a few more fields, such as csum_start: resetting
> > the first two fields only is wrong here.
> 
> I know TSO before commenting, but at least in this implementation and
> this specific patch, i guess zeroing two fields are enough.
> 
> What is wrong resetting only two fields?

I then have to ask "What is the benefit of resetting only two fields"?
If doing so, we have to change it back for TSO. My proposal requires no
extra change when adding TSO support.

--yliu


[dpdk-dev] [PATCH v2 4/4] virtio: check if any kernel driver is manipulating the virtio device

2016-01-27 Thread Thomas Monjalon
2016-01-07 16:17, Panu Matilainen:
> On 01/03/2016 07:56 PM, Huawei Xie wrote:
> > v2 changes:
> >   change LOG level from ERR to INFO
> >
> > virtio PMD could use IO port to configure the virtio device without
> > using uio driver.
> >
> > There are two issues with previous implementation:
> > 1) virtio PMD will take over each virtio device blindly even if some
> > are not intended for DPDK.
> > 2) driver conflict between virtio PMD and virtio-net kernel driver.
> >
> > This patch checks if there is any kernel driver manipulating the virtio
> > device before virtio PMD uses IO port to configure the device.
> >
> > Fixes: da978dfdc43b ("virtio: use port IO to get PCI resource")
> >
> > Signed-off-by: Huawei Xie 
> > ---
> >   drivers/net/virtio/virtio_ethdev.c | 7 +++
> >   1 file changed, 7 insertions(+)
> >
> > diff --git a/drivers/net/virtio/virtio_ethdev.c 
> > b/drivers/net/virtio/virtio_ethdev.c
> > index e815acd..7a50dac 100644
> > --- a/drivers/net/virtio/virtio_ethdev.c
> > +++ b/drivers/net/virtio/virtio_ethdev.c
> > @@ -1138,6 +1138,13 @@ static int virtio_resource_init_by_ioports(struct 
> > rte_pci_device *pci_dev)
> > int found = 0;
> > size_t linesz;
> >
> > +   if (pci_dev->kdrv != RTE_KDRV_NONE) {
> > +   PMD_INIT_LOG(INFO,
> > +   "kernel driver is manipulating this device." \
> > +   " Please unbind the kernel driver.");
> 
> At the very least this message needs to be changed.
> 
> Like said earlier, I think the message could just as well be dropped 
> entirely, but at least it should be something to the tune of "ignoring 
> kernel owned device" instead of asking the user to break their 
> configuration.

Huawei, a v3 is required. Thanks


[dpdk-dev] [PATCH] vfio/noiommu: Don't use iommu_present() to track fake groups

2016-01-27 Thread Burakov, Anatoly
Hi Alex,

> On 01/23/2016 04:23 AM, Alex Williamson wrote:
> > Using iommu_present() to determine whether an IOMMU group is real or
> > fake has some problems.  First, apparently Power systems don't
> > register an IOMMU on the device bus, so the groups and containers get
> > marked as noiommu and then won't bind to their actual IOMMU driver.
> > Second, I expect we'll run into the same issue as we try to support
> > vGPUs through vfio, since they're likely to emulate this behavior of
> > creating an IOMMU group on a virtual device and then providing a vfio
> > IOMMU backend tailored to the sort of isolation they provide, which
> > won't necessarily be fully compatible with the IOMMU API.
> >
> > The solution here is to use the existing iommudata interface to IOMMU
> > groups, which allows us to easily identify the fake groups we've
> > created for noiommu purposes.  The iommudata we set is purely
> > arbitrary since we're only comparing the address, so we use the
> > address of the noiommu switch itself.
> >
> > Reported-by: Alexey Kardashevskiy 
> > Fixes: 03a76b60f8ba ("vfio: Include No-IOMMU mode")
> > Signed-off-by: Alex Williamson 
> 
> 
> 
> Reviewed-by: Alexey Kardashevskiy 
> Tested-by: Alexey Kardashevskiy 

Tested bringing the NIC's up, encountered no issues. Curious if it also works 
for Santosh (CC'd) as he's one of the intended users of the No-IOMMU 
functionality, but otherwise seems to work.

Thanks,
Anatoly


[dpdk-dev] [RFC] eal: add cgroup-aware resource self discovery

2016-01-27 Thread Neil Horman
On Wed, Jan 27, 2016 at 08:02:27PM +0800, Tan, Jianfeng wrote:
> Hi Neil,
> 
> On 1/26/2016 10:19 PM, Neil Horman wrote:
> >On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
> >>Hi Neil,
> >>
> >>On 1/25/2016 9:46 PM, Neil Horman wrote:
> >>>On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> >>...
> -- 
> 2.1.4
> 
> 
> >>>This doesn't make a whole lot of sense, for several reasons:
> >>>
> >>>1) Applications, as a general rule shouldn't be interrogating the cgroups
> >>>interface at all.
> >>The main reason to do this in DPDK is that DPDK obtains resource information
> >>from sysfs and proc, which are not well containerized so far. And DPDK
> >>pre-allocates resource instead of on-demand gradual allocating.
> >>
> >Not disagreeing with this, just suggesting that:
> >
> >1) Interrogating cgroups really isn't the best way to collect that 
> >information
> >2) Pre-allocating those resources isn't particularly wise without some 
> >mechanism
> >to reallocate it, as resource constraints can change (consider your cpuset
> >getting rewritten)
> 
> In the case of reallocate,
> For cpuset, DPDK panics in the initialization if set_affinity fails, but
> after that, cpuset rewritten will not bring any problem I believe.
Yes, that seems reasonable, but I think you need to update
rte_thread_set_affinity to not assume that success in pthread_setaffinity_np
means that all cpus in the provided mask are available.  That is to say, cpusetp
is subsequently stored in lcore information after the set, but may not reflect
the actual working set of processors, you should follow a successful set with a
call to pthread_getaffinity_np to retrieve the actual working cpuset

As for subsequent changes to the cpuset, I'm not sure how you want to handle
that. I would think that you might want to run a check periodically or allow for
a SIGHUP or some other signal to trigger a rescan of your working cpuset so as
to keep the application in sync with the system.

> For memory, a running application uses 2G hugepages, then admin decreases
> hugetlb cgroup into 1G, the application will not get killed, unless it tries
> to access more hugepages (I'll double check this).
> 
No, the semantics should be identical to malloc/mmap (if you use the 
alloc_hugepages
api or the mmap api).  You should get a NULL return or other non-fatal indicator
if you allocate more than is available.

> So another way to address this problem is to add an option that DPDK tries
> best to allocate those resources, and if fails, it just posts a warning and
> uses those allocated resources, instead of panic. What do you think?
> 
Yes, that makes sense

> >
> >>>2) Cgroups aren't the only way in which a cpuset or memoryset can be 
> >>>restricted
> >>>(the isolcpus command line argument, or a taskset on a parent process for
> >>>instance, but there are several others).
> >>Yes, I agree. To enable that, I'd like design the new API for resource self
> >>discovery in a flexible way. A parameter "type" is used to specify the
> >>solution to discovery way. In addition, I'm considering to add a callback
> >>function pointer so that users can write their own resource discovery
> >>functions.
> >>
> >Why?  You don't need an API for this, or if you really want one, it can be 
> >very
> >generic if you use POSIX apis to gather the information.  What you have here 
> >is
> >going to be very linux specific, and will need reimplementing for BSD or 
> >other
> >operating systems.  To use the cpuset example, instead of reading and parsing
> >the mask files in the cgroup filesystem module to find your task and
> >corresponding mask, just call sched_setaffinity with an all f's mask, then 
> >call
> >sched_getaffinity.  The returned mask will be all the cpus your process is
> >allowed to execute on, taking into account every limiting filter the system 
> >you
> >are running on offers.
> 
> Yes, it makes sense on cpu's side.
> 
> >
> >There are similar OS level POSIX apis for most resources out there.  You 
> >really
> >don't need to dig through cgroups just to learn what some of those resources 
> >are.
> >
> >>>Instead of trying to figure out what cpuset is valid for your process by
> >>>interrogating the cgroups hierarchy, instead you should follow the 
> >>>prescribed
> >>>method of calling sched_getaffinity after calling sched_setaffinity.  That 
> >>>will
> >>>give you the canonical cpuset that you are executing on, taking all cpuset
> >>>filters into account (including cgroups and any other restrictions).  Its 
> >>>far
> >>>simpler as well, as it doesn't require a ton of file/string processing.
> >>Yes, this way is much better for cpuset discovery. But is there such a
> >>syscall for hugepages?
> >>
> >In what capacity?  Interrogating how many hugepages you have, or to what node
> >they are affined to?  Capacity would require reading the requisite proc 
> >file, as
> >theres no posix api for this resource.  Node affinity can be implied by 
> 

[dpdk-dev] [PATCH v2 15/16] fm10k/base: move constants to the right of binary operators

2016-01-27 Thread Wang Xiao W
The upstream Linux kernel community prefers constants are to the right of
binary operators.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_pf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_pf.c 
b/drivers/net/fm10k/base/fm10k_pf.c
index 456fe64..105babf 100644
--- a/drivers/net/fm10k/base/fm10k_pf.c
+++ b/drivers/net/fm10k/base/fm10k_pf.c
@@ -759,8 +759,8 @@ STATIC s32 fm10k_iov_assign_resources_pf(struct fm10k_hw 
*hw, u16 num_vfs,
FM10K_RXDCTL_WRITE_BACK_MIN_DELAY |
FM10K_RXDCTL_DROP_ON_EMPTY);
FM10K_WRITE_REG(hw, FM10K_RXQCTL(vf_q_idx),
-   FM10K_RXQCTL_VF |
-   (i << FM10K_RXQCTL_VF_SHIFT));
+   (i << FM10K_RXQCTL_VF_SHIFT) |
+   FM10K_RXQCTL_VF);

/* map queue pair to VF */
FM10K_WRITE_REG(hw, FM10K_TQMAP(qmap_idx), vf_q_idx);
@@ -1035,7 +1035,7 @@ STATIC s32 fm10k_iov_reset_resources_pf(struct fm10k_hw 
*hw,
txqctl = ((u32)vf_vid << FM10K_TXQCTL_VID_SHIFT) |
 (vf_idx << FM10K_TXQCTL_TC_SHIFT) |
 FM10K_TXQCTL_VF | vf_idx;
-   rxqctl = FM10K_RXQCTL_VF | (vf_idx << FM10K_RXQCTL_VF_SHIFT);
+   rxqctl = (vf_idx << FM10K_RXQCTL_VF_SHIFT) | FM10K_RXQCTL_VF;

/* stop further DMA and reset queue ownership back to VF */
for (i = vf_q_idx; i < (queues_per_pool + vf_q_idx); i++) {
-- 
1.9.3



[dpdk-dev] [PATCH v2 14/16] fm10k/base: TLV structures must be 4byte aligned, not 1byte aligned

2016-01-27 Thread Wang Xiao W
Per comments from an upstream patch, and looking at how TLV LE_STRUCT
code works, we actually want these structures to be 4byte aligned, not
1byte aligned. In practice, 1byte alignment has worked so far because
all our structures end up being a multiple of 4. But if a future TLV
structure were added that had a u8 or similar sticking on the end, things
would break. Fix this by using 4byte alignment which will prevent the
TLV LE_STRUCT code from breaking. Update the comment explaining that we
need 4byte alignment of our structures.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_pf.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_pf.h 
b/drivers/net/fm10k/base/fm10k_pf.h
index 92e2962..ee8527a 100644
--- a/drivers/net/fm10k/base/fm10k_pf.h
+++ b/drivers/net/fm10k/base/fm10k_pf.h
@@ -91,14 +91,14 @@ enum fm10k_pf_tlv_attr_id_v1 {
 #define FM10K_MSG_UPDATE_PVID_PVID_SHIFT   16
 #define FM10K_MSG_UPDATE_PVID_PVID_SIZE16

-/* The following data structures are overlayed specifically to TLV mailbox
- * messages, and must not have gaps between their values. They must line up
- * correctly to the TLV definition.
+/* The following data structures are overlayed directly onto TLV mailbox
+ * messages, and must not break 4 byte alignment. Ensure the structures line
+ * up correctly as per their TLV definition.
  */
 #ifdef C99
-#pragma pack(push, 1)
+#pragma pack(push, 4)
 #else
-#pragma pack(1)
+#pragma pack(4)
 #endif /* C99 */

 struct fm10k_mac_update {
-- 
1.9.3



[dpdk-dev] [PATCH v2 13/16] fm10k/base: fix comment per upstream review changes

2016-01-27 Thread Wang Xiao W
The comment here was changed during review of upstream patch, and the
new wording is slightly more clear. Re-write the comment in SHARED code
based on this new wording.

Fix a number of mailbox comment issues with function header comments,
lower-case acronyms (i.e. FIFO, TLV), incorrect function names in DEBUGFUNC(),
duplicate comments and a stubbed-out header comment for fm10k_sm_mbx_init.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_mbx.c | 61 ++
 drivers/net/fm10k/base/fm10k_mbx.h |  4 +--
 drivers/net/fm10k/base/fm10k_pf.c  | 12 
 drivers/net/fm10k/base/fm10k_tlv.h |  4 +--
 4 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_mbx.c 
b/drivers/net/fm10k/base/fm10k_mbx.c
index 7d03704..2e70434 100644
--- a/drivers/net/fm10k/base/fm10k_mbx.c
+++ b/drivers/net/fm10k/base/fm10k_mbx.c
@@ -70,7 +70,7 @@ STATIC u16 fm10k_fifo_unused(struct fm10k_mbx_fifo *fifo)
 }

 /**
- *  fm10k_fifo_empty - Test to verify if fifo is empty
+ *  fm10k_fifo_empty - Test to verify if FIFO is empty
  *  @fifo: pointer to FIFO
  *
  *  This function returns true if the FIFO is empty, else false
@@ -85,7 +85,7 @@ STATIC bool fm10k_fifo_empty(struct fm10k_mbx_fifo *fifo)
  *  @fifo: pointer to FIFO
  *  @offset: offset to add to head
  *
- *  This function returns the indices into the fifo based on head + offset
+ *  This function returns the indices into the FIFO based on head + offset
  **/
 STATIC u16 fm10k_fifo_head_offset(struct fm10k_mbx_fifo *fifo, u16 offset)
 {
@@ -97,7 +97,7 @@ STATIC u16 fm10k_fifo_head_offset(struct fm10k_mbx_fifo 
*fifo, u16 offset)
  *  @fifo: pointer to FIFO
  *  @offset: offset to add to tail
  *
- *  This function returns the indices into the fifo based on tail + offset
+ *  This function returns the indices into the FIFO based on tail + offset
  **/
 STATIC u16 fm10k_fifo_tail_offset(struct fm10k_mbx_fifo *fifo, u16 offset)
 {
@@ -173,7 +173,7 @@ STATIC u16 fm10k_mbx_index_len(struct fm10k_mbx_info *mbx, 
u16 head, u16 tail)
 /**
  *  fm10k_mbx_tail_add - Determine new tail value with added offset
  *  @mbx: pointer to mailbox
- *  @offset: length to add to head offset
+ *  @offset: length to add to tail offset
  *
  *  This function takes the local tail index and recomputes it for
  *  a given length added as an offset.
@@ -189,7 +189,7 @@ STATIC u16 fm10k_mbx_tail_add(struct fm10k_mbx_info *mbx, 
u16 offset)
 /**
  *  fm10k_mbx_tail_sub - Determine new tail value with subtracted offset
  *  @mbx: pointer to mailbox
- *  @offset: length to add to head offset
+ *  @offset: length to add to tail offset
  *
  *  This function takes the local tail index and recomputes it for
  *  a given length added as an offset.
@@ -253,7 +253,7 @@ STATIC u16 fm10k_mbx_pushed_tail_len(struct fm10k_mbx_info 
*mbx)
 }

 /**
- *  fm10k_fifo_write_copy - pulls data off of msg and places it in fifo
+ *  fm10k_fifo_write_copy - pulls data off of msg and places it in FIFO
  *  @fifo: pointer to FIFO
  *  @msg: message array to populate
  *  @tail_offset: additional offset to add to tail pointer
@@ -331,7 +331,7 @@ STATIC u16 fm10k_mbx_validate_msg_size(struct 
fm10k_mbx_info *mbx, u16 len)
u16 total_len = 0, msg_len;
u32 *msg;

-   DEBUGFUNC("fm10k_mbx_validate_msg");
+   DEBUGFUNC("fm10k_mbx_validate_msg_size");

/* length should include previous amounts pushed */
len += mbx->pushed;
@@ -353,6 +353,7 @@ STATIC u16 fm10k_mbx_validate_msg_size(struct 
fm10k_mbx_info *mbx, u16 len)

 /**
  *  fm10k_mbx_write_copy - pulls data off of Tx FIFO and places it in mbmem
+ *  @hw: pointer to hardware structure
  *  @mbx: pointer to mailbox
  *
  *  This function will take a section of the Tx FIFO and copy it into the
@@ -734,7 +735,7 @@ STATIC bool fm10k_mbx_tx_complete(struct fm10k_mbx_info 
*mbx)
  *  @hw: pointer to hardware structure
  *  @mbx: pointer to mailbox
  *
- *  This function dequeues messages and hands them off to the tlv parser.
+ *  This function dequeues messages and hands them off to the TLV parser.
  *  It will return the number of messages processed when called.
  **/
 STATIC u16 fm10k_mbx_dequeue_rx(struct fm10k_hw *hw,
@@ -951,7 +952,7 @@ STATIC void fm10k_mbx_create_fake_disconnect_hdr(struct 
fm10k_mbx_info *mbx)
 }

 /**
- *  fm10k_mbx_create_error_msg - Generate a error message
+ *  fm10k_mbx_create_error_msg - Generate an error message
  *  @mbx: pointer to mailbox
  *  @err: local error encountered
  *
@@ -984,7 +985,6 @@ STATIC void fm10k_mbx_create_error_msg(struct 
fm10k_mbx_info *mbx, s32 err)
 /**
  *  fm10k_mbx_validate_msg_hdr - Validate common fields in the message header
  *  @mbx: pointer to mailbox
- *  @msg: message array to read
  *
  *  This function will parse up the fields in the mailbox header and return
  *  an error if the header contains any of a number of invalid configurations
@@ -1050,11 +1050,12 @@ STATIC s32 

[dpdk-dev] [PATCH v2 12/16] fm10k/base: consistently use VLAN ID when referencing vid variables

2016-01-27 Thread Wang Xiao W
The vid variable name is shorthand for VLAN ID, so we should use this in
comments explaining what is happening.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_pf.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_pf.c 
b/drivers/net/fm10k/base/fm10k_pf.c
index f5cbda4..716d7f1 100644
--- a/drivers/net/fm10k/base/fm10k_pf.c
+++ b/drivers/net/fm10k/base/fm10k_pf.c
@@ -970,7 +970,7 @@ err_out:
txqctl |= (vf_idx << FM10K_TXQCTL_TC_SHIFT) |
  FM10K_TXQCTL_VF | vf_idx;

-   /* assign VID */
+   /* assign VLAN ID */
for (i = 0; i < queues_per_pool; i++)
FM10K_WRITE_REG(hw, FM10K_TXQCTL(vf_q_idx + i), txqctl);

@@ -1215,12 +1215,12 @@ s32 fm10k_iov_msg_msix_pf(struct fm10k_hw *hw, u32 
**results,
 }

 /**
- * fm10k_iov_select_vid - Select correct default vid
+ * fm10k_iov_select_vid - Select correct default VLAN ID
  * @hw: Pointer to hardware structure
- * @vid: vid to correct
+ * @vid: VLAN ID to correct
  *
- * Will report an error if vid is out of range. For vid = 0, it will return
- * either the pf_vid or sw_vid depending on which one is set.
+ * Will report an error if the VLAN ID is out of range. For VID = 0, it will
+ * return either the pf_vid or sw_vid depending on which one is set.
  */
 STATIC s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid)
 {
@@ -1783,7 +1783,7 @@ static s32 fm10k_msg_update_pvid_pf(struct fm10k_hw *hw, 
u32 **results,
if (!fm10k_glort_valid_pf(hw, glort))
return FM10K_ERR_PARAM;

-   /* verify VID is valid */
+   /* verify VLAN ID is valid */
if (pvid >= FM10K_VLAN_TABLE_VID_MAX)
return FM10K_ERR_PARAM;

-- 
1.9.3



[dpdk-dev] [PATCH v2 11/16] fm10k/base: allow removal of is_slot_appropriate function

2016-01-27 Thread Wang Xiao W
The Linux Kernel provides the OS a call "pcie_get_minimum_link" which
can crawl the PCIe tree and determine the actual minimum link speed of a
device which is a more general check than provided by
is_slot_appropriate. Thus, the upstream driver does not use or want the
is_slot_appropriate function call. Add a NO_IS_SLOT_APPROPRIATE_CHECK
definition which can be defined during strip process to remove the code.
If left undefined (the default) then the code will all be active and no
driver changes should be necessary.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_api.c  | 2 ++
 drivers/net/fm10k/base/fm10k_api.h  | 2 ++
 drivers/net/fm10k/base/fm10k_pf.c   | 4 
 drivers/net/fm10k/base/fm10k_type.h | 2 ++
 drivers/net/fm10k/base/fm10k_vf.c   | 4 
 5 files changed, 14 insertions(+)

diff --git a/drivers/net/fm10k/base/fm10k_api.c 
b/drivers/net/fm10k/base/fm10k_api.c
index eb5bdaa..c49d20d 100644
--- a/drivers/net/fm10k/base/fm10k_api.c
+++ b/drivers/net/fm10k/base/fm10k_api.c
@@ -181,6 +181,7 @@ s32 fm10k_get_bus_info(struct fm10k_hw *hw)
   FM10K_NOT_IMPLEMENTED);
 }

+#ifndef NO_IS_SLOT_APPROPRIATE_CHECK
 /**
  *  fm10k_is_slot_appropriate - Indicate appropriate slot for this SKU
  *  @hw: pointer to hardware structure
@@ -195,6 +196,7 @@ bool fm10k_is_slot_appropriate(struct fm10k_hw *hw)
return true;
 }

+#endif
 /**
  *  fm10k_update_vlan - Clear VLAN ID to VLAN filter table
  *  @hw: pointer to hardware structure
diff --git a/drivers/net/fm10k/base/fm10k_api.h 
b/drivers/net/fm10k/base/fm10k_api.h
index 113aef5..2ab3149 100644
--- a/drivers/net/fm10k/base/fm10k_api.h
+++ b/drivers/net/fm10k/base/fm10k_api.h
@@ -44,7 +44,9 @@ s32 fm10k_stop_hw(struct fm10k_hw *hw);
 s32 fm10k_start_hw(struct fm10k_hw *hw);
 s32 fm10k_init_shared_code(struct fm10k_hw *hw);
 s32 fm10k_get_bus_info(struct fm10k_hw *hw);
+#ifndef NO_IS_SLOT_APPROPRIATE_CHECK
 bool fm10k_is_slot_appropriate(struct fm10k_hw *hw);
+#endif
 s32 fm10k_update_vlan(struct fm10k_hw *hw, u32 vid, u8 idx, bool set);
 s32 fm10k_read_mac_addr(struct fm10k_hw *hw);
 void fm10k_update_hw_stats(struct fm10k_hw *hw, struct fm10k_hw_stats *stats);
diff --git a/drivers/net/fm10k/base/fm10k_pf.c 
b/drivers/net/fm10k/base/fm10k_pf.c
index a1469aa..f5cbda4 100644
--- a/drivers/net/fm10k/base/fm10k_pf.c
+++ b/drivers/net/fm10k/base/fm10k_pf.c
@@ -216,6 +216,7 @@ STATIC s32 fm10k_init_hw_pf(struct fm10k_hw *hw)
return FM10K_SUCCESS;
 }

+#ifndef NO_IS_SLOT_APPROPRIATE_CHECK
 /**
  *  fm10k_is_slot_appropriate_pf - Indicate appropriate slot for this SKU
  *  @hw: pointer to hardware structure
@@ -231,6 +232,7 @@ STATIC bool fm10k_is_slot_appropriate_pf(struct fm10k_hw 
*hw)
   (hw->bus.width == hw->bus_caps.width);
 }

+#endif
 /**
  *  fm10k_update_vlan_pf - Update status of VLAN ID in VLAN filter table
  *  @hw: pointer to hardware structure
@@ -2064,7 +2066,9 @@ s32 fm10k_init_ops_pf(struct fm10k_hw *hw)
mac->ops.init_hw = _init_hw_pf;
mac->ops.start_hw = _start_hw_generic;
mac->ops.stop_hw = _stop_hw_generic;
+#ifndef NO_IS_SLOT_APPROPRIATE_CHECK
mac->ops.is_slot_appropriate = _is_slot_appropriate_pf;
+#endif
mac->ops.update_vlan = _update_vlan_pf;
mac->ops.read_mac_addr = _read_mac_addr_pf;
mac->ops.update_uc_addr = _update_uc_addr_pf;
diff --git a/drivers/net/fm10k/base/fm10k_type.h 
b/drivers/net/fm10k/base/fm10k_type.h
index c9885a1..ba0a184 100644
--- a/drivers/net/fm10k/base/fm10k_type.h
+++ b/drivers/net/fm10k/base/fm10k_type.h
@@ -679,7 +679,9 @@ struct fm10k_mac_ops {
s32 (*stop_hw)(struct fm10k_hw *);
s32 (*get_bus_info)(struct fm10k_hw *);
s32 (*get_host_state)(struct fm10k_hw *, bool *);
+#ifndef NO_IS_SLOT_APPROPRIATE_CHECK
bool (*is_slot_appropriate)(struct fm10k_hw *);
+#endif
s32 (*update_vlan)(struct fm10k_hw *, u32, u8, bool);
s32 (*read_mac_addr)(struct fm10k_hw *);
s32 (*update_uc_addr)(struct fm10k_hw *, u16, const u8 *,
diff --git a/drivers/net/fm10k/base/fm10k_vf.c 
b/drivers/net/fm10k/base/fm10k_vf.c
index 43eb081..efbdbd1 100644
--- a/drivers/net/fm10k/base/fm10k_vf.c
+++ b/drivers/net/fm10k/base/fm10k_vf.c
@@ -178,6 +178,7 @@ reset_max_queues:
return err;
 }

+#ifndef NO_IS_SLOT_APPROPRIATE_CHECK
 /**
  *  fm10k_is_slot_appropriate_vf - Indicate appropriate slot for this SKU
  *  @hw: pointer to hardware structure
@@ -194,6 +195,7 @@ STATIC bool fm10k_is_slot_appropriate_vf(struct fm10k_hw 
*hw)
return TRUE;
 }

+#endif
 /* This structure defines the attibutes to be parsed below */
 const struct fm10k_tlv_attr fm10k_mac_vlan_msg_attr[] = {
FM10K_TLV_ATTR_U32(FM10K_MAC_VLAN_MSG_VLAN),
@@ -648,7 +650,9 @@ s32 fm10k_init_ops_vf(struct fm10k_hw *hw)
mac->ops.init_hw = _init_hw_vf;
mac->ops.start_hw = _start_hw_generic;
mac->ops.stop_hw = _stop_hw_vf;
+#ifndef NO_IS_SLOT_APPROPRIATE_CHECK

[dpdk-dev] [PATCH v2 10/16] fm10k/base: use memcpy for mac addr copy

2016-01-27 Thread Wang Xiao W
Use memcpy instead of copying MAC address byte-by-byte.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_pf.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_pf.c 
b/drivers/net/fm10k/base/fm10k_pf.c
index 7d48210..a1469aa 100644
--- a/drivers/net/fm10k/base/fm10k_pf.c
+++ b/drivers/net/fm10k/base/fm10k_pf.c
@@ -300,7 +300,6 @@ STATIC s32 fm10k_read_mac_addr_pf(struct fm10k_hw *hw)
 {
u8 perm_addr[ETH_ALEN];
u32 serial_num;
-   int i;

DEBUGFUNC("fm10k_read_mac_addr_pf");

@@ -324,10 +323,8 @@ STATIC s32 fm10k_read_mac_addr_pf(struct fm10k_hw *hw)
perm_addr[4] = (u8)(serial_num >> 8);
perm_addr[5] = (u8)(serial_num);

-   for (i = 0; i < ETH_ALEN; i++) {
-   hw->mac.perm_addr[i] = perm_addr[i];
-   hw->mac.addr[i] = perm_addr[i];
-   }
+   memcpy(hw->mac.perm_addr, perm_addr, ETH_ALEN);
+   memcpy(hw->mac.addr, perm_addr, ETH_ALEN);

return FM10K_SUCCESS;
 }
-- 
1.9.3



[dpdk-dev] [PATCH v2 09/16] fm10k/base: do not use CamelCase

2016-01-27 Thread Wang Xiao W
The upstream Linux kernel community prefers avoiding CamelCase in
variables, function names, etc.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_type.h | 14 +++---
 drivers/net/fm10k/fm10k_ethdev.c| 24 
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_type.h 
b/drivers/net/fm10k/base/fm10k_type.h
index 387d25b..c9885a1 100644
--- a/drivers/net/fm10k/base/fm10k_type.h
+++ b/drivers/net/fm10k/base/fm10k_type.h
@@ -531,13 +531,13 @@ struct fm10k_hw;
 #endif

 enum fm10k_int_source {
-   fm10k_int_Mailbox   = 0,
-   fm10k_int_PCIeFault = 1,
-   fm10k_int_SwitchUpDown  = 2,
-   fm10k_int_SwitchEvent   = 3,
-   fm10k_int_SRAM  = 4,
-   fm10k_int_VFLR  = 5,
-   fm10k_int_MaxHoldTime   = 6,
+   fm10k_int_mailbox   = 0,
+   fm10k_int_pcie_fault= 1,
+   fm10k_int_switch_up_down= 2,
+   fm10k_int_switch_event  = 3,
+   fm10k_int_sram  = 4,
+   fm10k_int_vflr  = 5,
+   fm10k_int_max_hold_time = 6,
fm10k_int_sources_max_pf
 };

diff --git a/drivers/net/fm10k/fm10k_ethdev.c b/drivers/net/fm10k/fm10k_ethdev.c
index 2c38ce9..a118cf4 100644
--- a/drivers/net/fm10k/fm10k_ethdev.c
+++ b/drivers/net/fm10k/fm10k_ethdev.c
@@ -2074,12 +2074,12 @@ fm10k_dev_enable_intr_pf(struct rte_eth_dev *dev)
/* Bind all local non-queue interrupt to vector 0 */
int_map |= 0;

-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_Mailbox), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_PCIeFault), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SwitchUpDown), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SwitchEvent), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SRAM), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_VFLR), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_mailbox), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_pcie_fault), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_switch_up_down), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_switch_event), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_sram), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_vflr), int_map);

/* Enable misc causes */
FM10K_WRITE_REG(hw, FM10K_EIMR, FM10K_EIMR_ENABLE(PCA_FAULT) |
@@ -2105,12 +2105,12 @@ fm10k_dev_disable_intr_pf(struct rte_eth_dev *dev)

int_map |= 0;

-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_Mailbox), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_PCIeFault), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SwitchUpDown), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SwitchEvent), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SRAM), int_map);
-   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_VFLR), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_mailbox), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_pcie_fault), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_switch_up_down), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_switch_event), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_sram), int_map);
+   FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_vflr), int_map);

/* Disable misc causes */
FM10K_WRITE_REG(hw, FM10K_EIMR, FM10K_EIMR_DISABLE(PCA_FAULT) |
-- 
1.9.3



[dpdk-dev] [PATCH v2 07/16] fm10k/base: fix checkpatch warning

2016-01-27 Thread Wang Xiao W
Clean up lines over 80 characters.
Clean up useless else; checkpatch warns that else is not generally
useful after a break or return.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_mbx.c |  2 +-
 drivers/net/fm10k/base/fm10k_pf.c  | 19 ++-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_mbx.c 
b/drivers/net/fm10k/base/fm10k_mbx.c
index 3c9ab3a..7d03704 100644
--- a/drivers/net/fm10k/base/fm10k_mbx.c
+++ b/drivers/net/fm10k/base/fm10k_mbx.c
@@ -930,7 +930,7 @@ STATIC void fm10k_mbx_create_disconnect_hdr(struct 
fm10k_mbx_info *mbx)
 }

 /**
- *  fm10k_mbx_create_fake_disconnect_hdr - Generate a false disconnect mailbox 
header
+ *  fm10k_mbx_create_fake_disconnect_hdr - Generate a false disconnect mbox hdr
  *  @mbx: pointer to mailbox
  *
  *  This function creates a fake disconnect header for loading into remote
diff --git a/drivers/net/fm10k/base/fm10k_pf.c 
b/drivers/net/fm10k/base/fm10k_pf.c
index 6de679e..3ee88b6 100644
--- a/drivers/net/fm10k/base/fm10k_pf.c
+++ b/drivers/net/fm10k/base/fm10k_pf.c
@@ -1278,8 +1278,8 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 
**results,
err = fm10k_iov_select_vid(vf_info, (u16)vid);
if (err < 0)
return err;
-   else
-   vid = err;
+
+   vid = err;

/* update VSI info for VF in regards to VLAN table */
err = hw->mac.ops.update_vlan(hw, vid, vf_info->vsi, set);
@@ -1304,8 +1304,8 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 
**results,
err = fm10k_iov_select_vid(vf_info, vlan);
if (err < 0)
return err;
-   else
-   vlan = (u16)err;
+
+   vlan = (u16)err;

/* notify switch of request for new unicast address */
err = hw->mac.ops.update_uc_addr(hw, vf_info->glort,
@@ -1330,8 +1330,8 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 
**results,
err = fm10k_iov_select_vid(vf_info, vlan);
if (err < 0)
return err;
-   else
-   vlan = (u16)err;
+
+   vlan = (u16)err;

/* notify switch of request for new multicast address */
err = hw->mac.ops.update_mc_addr(hw, vf_info->glort,
@@ -1500,9 +1500,10 @@ STATIC void fm10k_update_hw_stats_pf(struct fm10k_hw *hw,
xec = fm10k_read_hw_stats_32b(hw, FM10K_STATS_XEC, >xec);
vlan_drop = fm10k_read_hw_stats_32b(hw, FM10K_STATS_VLAN_DROP,
>vlan_drop);
-   loopback_drop = fm10k_read_hw_stats_32b(hw,
-   
FM10K_STATS_LOOPBACK_DROP,
-   >loopback_drop);
+   loopback_drop =
+   fm10k_read_hw_stats_32b(hw,
+   FM10K_STATS_LOOPBACK_DROP,
+   >loopback_drop);
nodesc_drop = fm10k_read_hw_stats_32b(hw,
  FM10K_STATS_NODESC_DROP,
  >nodesc_drop);
-- 
1.9.3



[dpdk-dev] [PATCH v2 06/16] fm10k/base: document ITR scale workaround in VF TDLEN register

2016-01-27 Thread Wang Xiao W
Add comments which properly explain the undocumented use of bits in
TDLEN register prior to VF initializing it to the correct value. Note
that the mechanism is entirely software-defined and explain its purpose
to help reduce confusion in the future.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_pf.c   | 6 +-
 drivers/net/fm10k/base/fm10k_type.h | 9 +
 drivers/net/fm10k/base/fm10k_vf.c   | 9 +
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/net/fm10k/base/fm10k_pf.c 
b/drivers/net/fm10k/base/fm10k_pf.c
index 5b8c039..6de679e 100644
--- a/drivers/net/fm10k/base/fm10k_pf.c
+++ b/drivers/net/fm10k/base/fm10k_pf.c
@@ -958,7 +958,8 @@ STATIC s32 fm10k_iov_assign_default_mac_vlan_pf(struct 
fm10k_hw *hw,
FM10K_WRITE_REG(hw, FM10K_TDBAH(vf_q_idx), tdbah);

/* Provide the VF the ITR scale, using software-defined fields in TDLEN
-* to pass the information during VF initialization
+* to pass the information during VF initialization. See definition of
+* FM10K_TDLEN_ITR_SCALE_SHIFT for more details.
 */
FM10K_WRITE_REG(hw, FM10K_TDLEN(vf_q_idx), hw->mac.itr_scale <<
   FM10K_TDLEN_ITR_SCALE_SHIFT);
@@ -1095,6 +1096,9 @@ STATIC s32 fm10k_iov_reset_resources_pf(struct fm10k_hw 
*hw,
for (i = queues_per_pool; i--;) {
FM10K_WRITE_REG(hw, FM10K_TDBAL(vf_q_idx + i), tdbal);
FM10K_WRITE_REG(hw, FM10K_TDBAH(vf_q_idx + i), tdbah);
+   /* See definition of FM10K_TDLEN_ITR_SCALE_SHIFT for an
+* explanation of how TDLEN is used.
+*/
FM10K_WRITE_REG(hw, FM10K_TDLEN(vf_q_idx + i),
hw->mac.itr_scale <<
FM10K_TDLEN_ITR_SCALE_SHIFT);
diff --git a/drivers/net/fm10k/base/fm10k_type.h 
b/drivers/net/fm10k/base/fm10k_type.h
index 44187b1..5db6345 100644
--- a/drivers/net/fm10k/base/fm10k_type.h
+++ b/drivers/net/fm10k/base/fm10k_type.h
@@ -350,6 +350,15 @@ struct fm10k_hw;
 #define FM10K_TDBAL(_n)((0x40 * (_n)) + 0x8000)
 #define FM10K_TDBAH(_n)((0x40 * (_n)) + 0x8001)
 #define FM10K_TDLEN(_n)((0x40 * (_n)) + 0x8002)
+/* When fist initialized, VFs need to know the Interrupt Throttle Rate (ITR)
+ * scale which is based on the PCIe speed but the speed information in the PCI
+ * configuration space may not be accurate. The PF already knows the ITR scale
+ * but there is no defined method to pass that information from the PF to the
+ * VF. This is accomplished during VF initialization by temporarily co-opting
+ * the yet-to-be-used TDLEN register to have the PF store the ITR shift for
+ * the VF to retrieve before the VF needs to use the TDLEN register for its
+ * intended purpose, i.e. before the Tx resources are allocated.
+ */
 #define FM10K_TDLEN_ITR_SCALE_SHIFT9
 #define FM10K_TDLEN_ITR_SCALE_MASK 0x0E00
 #define FM10K_TDLEN_ITR_SCALE_GEN1 2
diff --git a/drivers/net/fm10k/base/fm10k_vf.c 
b/drivers/net/fm10k/base/fm10k_vf.c
index 9b10ee4..43eb081 100644
--- a/drivers/net/fm10k/base/fm10k_vf.c
+++ b/drivers/net/fm10k/base/fm10k_vf.c
@@ -74,6 +74,11 @@ STATIC s32 fm10k_stop_hw_vf(struct fm10k_hw *hw)
FM10K_WRITE_REG(hw, FM10K_TDBAH(i), bah);
FM10K_WRITE_REG(hw, FM10K_RDBAL(i), bal);
FM10K_WRITE_REG(hw, FM10K_RDBAH(i), bah);
+   /* Restore ITR scale in software-defined mechanism in TDLEN
+* for next VF initialization. See definition of
+* FM10K_TDLEN_ITR_SCALE_SHIFT for more details on the use of
+* TDLEN here.
+*/
FM10K_WRITE_REG(hw, FM10K_TDLEN(i), tdlen);
}

@@ -157,6 +162,10 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
/* fetch default VLAN and ITR scale */
hw->mac.default_vid = (FM10K_READ_REG(hw, FM10K_TXQCTL(0)) &
   FM10K_TXQCTL_VID_MASK) >> FM10K_TXQCTL_VID_SHIFT;
+   /* Read the ITR scale from TDLEN. See the definition of
+* FM10K_TDLEN_ITR_SCALE_SHIFT for more information about how TDLEN is
+* used here.
+*/
hw->mac.itr_scale = (FM10K_READ_REG(hw, FM10K_TDLEN(0)) &
 FM10K_TDLEN_ITR_SCALE_MASK) >>
FM10K_TDLEN_ITR_SCALE_SHIFT;
-- 
1.9.3



[dpdk-dev] [PATCH v2 05/16] fm10k/base: reset max_queues on init_hw_vf failure

2016-01-27 Thread Wang Xiao W
VF drivers must detect how many queues are available. Previously, the
driver assumed that each VF has at minimum 1 queue. This assumption is
incorrect, since it is possible that the PF has not yet assigned the
queues to the VF by the time the VF checks. To resolve this, we added a
check first to ensure that the first queue is in fact owned by the VF at
init_hw_vf time. However, the code flow did not reset hw->mac.max_queues
to 0. In some cases, such as during reinit flows, we call init_hw_vf
without clearing the previous value of hw->mac.max_queues. Due to this,
when init_hw_vf errors out, if its error code is not properly handled
the VF driver may still believe it has queues which no longer belong to
it. Fix this by clearing the hw->mac.max_queues on exit due to errors.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_vf.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_vf.c 
b/drivers/net/fm10k/base/fm10k_vf.c
index 39bc927..9b10ee4 100644
--- a/drivers/net/fm10k/base/fm10k_vf.c
+++ b/drivers/net/fm10k/base/fm10k_vf.c
@@ -128,8 +128,10 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw)

/* verify we have at least 1 queue */
if (!~FM10K_READ_REG(hw, FM10K_TXQCTL(0)) ||
-   !~FM10K_READ_REG(hw, FM10K_RXQCTL(0)))
-   return FM10K_ERR_NO_RESOURCES;
+   !~FM10K_READ_REG(hw, FM10K_RXQCTL(0))) {
+   err = FM10K_ERR_NO_RESOURCES;
+   goto reset_max_queues;
+   }

/* determine how many queues we have */
for (i = 1; tqdloc0 && (i < FM10K_MAX_QUEUES_POOL); i++) {
@@ -147,7 +149,7 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
/* shut down queues we own and reset DMA configuration */
err = fm10k_disable_queues_generic(hw, i);
if (err)
-   return err;
+   goto reset_max_queues;

/* record maximum queue count */
hw->mac.max_queues = i;
@@ -160,6 +162,11 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
FM10K_TDLEN_ITR_SCALE_SHIFT;

return FM10K_SUCCESS;
+
+reset_max_queues:
+   hw->mac.max_queues = 0;
+
+   return err;
 }

 /**
-- 
1.9.3



[dpdk-dev] [PATCH v2 04/16] fm10k/base: use bitshift for itr_scale

2016-01-27 Thread Wang Xiao W
Upstream community wishes us to use bitshift instead of a divisor,
because this is faster, and prevents any need for a '0' check. In our
case, this even works out because default Gen3 will be 0.

Because of this, we are also able to remove the check for non-zero value
in the vf code path since that will already be the default Gen3 case.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_type.h | 6 +++---
 drivers/net/fm10k/base/fm10k_vf.c   | 4 
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_type.h 
b/drivers/net/fm10k/base/fm10k_type.h
index 62fa73f..44187b1 100644
--- a/drivers/net/fm10k/base/fm10k_type.h
+++ b/drivers/net/fm10k/base/fm10k_type.h
@@ -352,9 +352,9 @@ struct fm10k_hw;
 #define FM10K_TDLEN(_n)((0x40 * (_n)) + 0x8002)
 #define FM10K_TDLEN_ITR_SCALE_SHIFT9
 #define FM10K_TDLEN_ITR_SCALE_MASK 0x0E00
-#define FM10K_TDLEN_ITR_SCALE_GEN1 4
-#define FM10K_TDLEN_ITR_SCALE_GEN2 2
-#define FM10K_TDLEN_ITR_SCALE_GEN3 1
+#define FM10K_TDLEN_ITR_SCALE_GEN1 2
+#define FM10K_TDLEN_ITR_SCALE_GEN2 1
+#define FM10K_TDLEN_ITR_SCALE_GEN3 0
 #define FM10K_TPH_TXCTRL(_n)   ((0x40 * (_n)) + 0x8003)
 #define FM10K_TPH_TXCTRL_DESC_TPHEN0x0020
 #define FM10K_TPH_TXCTRL_DESC_RROEN0x0200
diff --git a/drivers/net/fm10k/base/fm10k_vf.c 
b/drivers/net/fm10k/base/fm10k_vf.c
index 7822ab6..39bc927 100644
--- a/drivers/net/fm10k/base/fm10k_vf.c
+++ b/drivers/net/fm10k/base/fm10k_vf.c
@@ -159,10 +159,6 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw)
 FM10K_TDLEN_ITR_SCALE_MASK) >>
FM10K_TDLEN_ITR_SCALE_SHIFT;

-   /* ensure a non-zero itr scale */
-   if (!hw->mac.itr_scale)
-   hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN3;
-
return FM10K_SUCCESS;
 }

-- 
1.9.3



[dpdk-dev] [PATCH v2 03/16] fm10k/base: cleanup namespace pollution and correct typecast

2016-01-27 Thread Wang Xiao W
Correct typecast in fm10k_update_xc_addr_pf.

Make functions that are only referenced locally static.

And fix the function header comment for fm10k_tlv_attr_nest_stop() while
we're at it.

Wrap fm10k_msg_data fm10k_iov_msg_data_pf[] in the new ifndef
NO_DEFAULT_SRIOV_MSG_HANDLERS so that drivers with custom SR-IOV
message handlers can strip it.

Remove unused struct element in struct fm10k_mac_ops.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_pf.c   | 10 ++
 drivers/net/fm10k/base/fm10k_pf.h   |  4 ++--
 drivers/net/fm10k/base/fm10k_tlv.c  | 16 
 drivers/net/fm10k/base/fm10k_tlv.h  |  5 -
 drivers/net/fm10k/base/fm10k_type.h |  1 -
 drivers/net/fm10k/base/fm10k_vf.c   |  2 --
 6 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/drivers/net/fm10k/base/fm10k_pf.c 
b/drivers/net/fm10k/base/fm10k_pf.c
index 6e6d71e..5b8c039 100644
--- a/drivers/net/fm10k/base/fm10k_pf.c
+++ b/drivers/net/fm10k/base/fm10k_pf.c
@@ -379,8 +379,8 @@ STATIC s32 fm10k_update_xc_addr_pf(struct fm10k_hw *hw, u16 
glort,
 ((u32)mac[3] << 16) |
 ((u32)mac[4] << 8) |
 ((u32)mac[5]));
-   mac_update.mac_upper = FM10K_CPU_TO_LE16(((u32)mac[0] << 8) |
-((u32)mac[1]));
+   mac_update.mac_upper = FM10K_CPU_TO_LE16(((u16)mac[0] << 8) |
+  ((u16)mac[1]));
mac_update.vlan = FM10K_CPU_TO_LE16(vid);
mac_update.glort = FM10K_CPU_TO_LE16(glort);
mac_update.action = add ? 0 : 1;
@@ -1457,6 +1457,7 @@ s32 fm10k_iov_msg_lport_state_pf(struct fm10k_hw *hw, u32 
**results,
return err;
 }

+#ifndef NO_DEFAULT_SRIOV_MSG_HANDLERS
 const struct fm10k_msg_data fm10k_iov_msg_data_pf[] = {
FM10K_TLV_MSG_TEST_HANDLER(fm10k_tlv_msg_test),
FM10K_VF_MSG_MSIX_HANDLER(fm10k_iov_msg_msix_pf),
@@ -1465,6 +1466,7 @@ const struct fm10k_msg_data fm10k_iov_msg_data_pf[] = {
FM10K_TLV_MSG_ERROR_HANDLER(fm10k_tlv_msg_error),
 };

+#endif
 /**
  *  fm10k_update_stats_hw_pf - Updates hardware related statistics of PF
  *  @hw: pointer to hardware structure
@@ -1754,8 +1756,8 @@ const struct fm10k_tlv_attr fm10k_update_pvid_msg_attr[] 
= {
  *
  *  This handler configures the default VLAN for the PF
  **/
-s32 fm10k_msg_update_pvid_pf(struct fm10k_hw *hw, u32 **results,
-struct fm10k_mbx_info *mbx)
+static s32 fm10k_msg_update_pvid_pf(struct fm10k_hw *hw, u32 **results,
+   struct fm10k_mbx_info *mbx)
 {
u16 glort, pvid;
u32 pvid_update;
diff --git a/drivers/net/fm10k/base/fm10k_pf.h 
b/drivers/net/fm10k/base/fm10k_pf.h
index 44bd193..92e2962 100644
--- a/drivers/net/fm10k/base/fm10k_pf.h
+++ b/drivers/net/fm10k/base/fm10k_pf.h
@@ -149,8 +149,6 @@ extern const struct fm10k_tlv_attr 
fm10k_lport_map_msg_attr[];
 #define FM10K_PF_MSG_LPORT_MAP_HANDLER(func) \
FM10K_MSG_HANDLER(FM10K_PF_MSG_ID_LPORT_MAP, \
  fm10k_lport_map_msg_attr, func)
-s32 fm10k_msg_update_pvid_pf(struct fm10k_hw *, u32 **,
-struct fm10k_mbx_info *);
 extern const struct fm10k_tlv_attr fm10k_update_pvid_msg_attr[];
 #define FM10K_PF_MSG_UPDATE_PVID_HANDLER(func) \
FM10K_MSG_HANDLER(FM10K_PF_MSG_ID_UPDATE_PVID, \
@@ -183,7 +181,9 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *, u32 **,
  struct fm10k_mbx_info *);
 s32 fm10k_iov_msg_lport_state_pf(struct fm10k_hw *, u32 **,
 struct fm10k_mbx_info *);
+#ifndef NO_DEFAULT_SRIOV_MSG_HANDLERS
 extern const struct fm10k_msg_data fm10k_iov_msg_data_pf[];
+#endif

 s32 fm10k_init_ops_pf(struct fm10k_hw *hw);
 #endif /* _FM10K_PF_H */
diff --git a/drivers/net/fm10k/base/fm10k_tlv.c 
b/drivers/net/fm10k/base/fm10k_tlv.c
index 1d9d7d8..ade87d1 100644
--- a/drivers/net/fm10k/base/fm10k_tlv.c
+++ b/drivers/net/fm10k/base/fm10k_tlv.c
@@ -63,8 +63,8 @@ s32 fm10k_tlv_msg_init(u32 *msg, u16 msg_id)
  *  the attribute buffer.  It will return success if provided with a valid
  *  pointers.
  **/
-s32 fm10k_tlv_attr_put_null_string(u32 *msg, u16 attr_id,
-  const unsigned char *string)
+static s32 fm10k_tlv_attr_put_null_string(u32 *msg, u16 attr_id,
+ const unsigned char *string)
 {
u32 attr_data = 0, len = 0;
u32 *attr;
@@ -115,7 +115,7 @@ s32 fm10k_tlv_attr_put_null_string(u32 *msg, u16 attr_id,
  *  it in the array pointed by by string.  It will return success if provided
  *  with a valid pointers.
  **/
-s32 fm10k_tlv_attr_get_null_string(u32 *attr, unsigned char *string)
+static s32 fm10k_tlv_attr_get_null_string(u32 *attr, unsigned char *string)
 {
u32 len;

@@ -386,7 +386,7 @@ s32 fm10k_tlv_attr_get_le_struct(u32 *attr, 

[dpdk-dev] [PATCH v2 02/16] fm10k/base: add macro definitions that are needed

2016-01-27 Thread Wang Xiao W
Some macros such as FM10K_RXINT_TIMER_SHIFT are removed in the shared
code drop, but they are needed in dpdk/fm10k. This patch puts all these
necessary macros into fm10k_osdep.h

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/base/fm10k_osdep.h | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/fm10k/base/fm10k_osdep.h 
b/drivers/net/fm10k/base/fm10k_osdep.h
index 6852ef0..869af1b 100644
--- a/drivers/net/fm10k/base/fm10k_osdep.h
+++ b/drivers/net/fm10k/base/fm10k_osdep.h
@@ -150,6 +150,36 @@ typedef intbool;
 #define fm10k_read_reg FM10K_READ_REG
 #endif

+#define FM10K_INTEL_VENDOR_ID   0x8086
+#define FM10K_DMA_CTRL_MINMSS_SHIFT9
+#define FM10K_EICR_PCA_FAULT   0x0001
+#define FM10K_EICR_THI_FAULT   0x0004
+#define FM10K_EICR_FUM_FAULT   0x0020
+#define FM10K_EICR_SRAMERROR   0x0400
+#define FM10K_SRAM_IP  0x13003
+#define FM10K_RXINT_TIMER_SHIFT8
+#define FM10K_TXINT_TIMER_SHIFT8
+#define FM10K_RXD_PKTTYPE_MASK 0x03F0
+#define FM10K_RXD_PKTTYPE_SHIFT4
+enum fm10k_rdesc_pkt_type {
+   /* L3 type */
+   FM10K_PKTTYPE_OTHER = 0x00,
+   FM10K_PKTTYPE_IPV4  = 0x01,
+   FM10K_PKTTYPE_IPV4_EX   = 0x02,
+   FM10K_PKTTYPE_IPV6  = 0x03,
+   FM10K_PKTTYPE_IPV6_EX   = 0x04,
+
+   /* L4 type */
+   FM10K_PKTTYPE_TCP   = 0x08,
+   FM10K_PKTTYPE_UDP   = 0x10,
+   FM10K_PKTTYPE_GRE   = 0x18,
+   FM10K_PKTTYPE_VXLAN = 0x20,
+   FM10K_PKTTYPE_NVGRE = 0x28,
+   FM10K_PKTTYPE_GENEVE= 0x30
+};
+#define FM10K_RXD_STATUS_IPCS  0x0008 /* Indicates IPv4 csum */
+#define FM10K_RXD_STATUS_HBO   0x0400 /* header buffer overrun */
+
 #define FM10K_TSO_MINMSS \
(FM10K_DMA_CTRL_MINMSS_64 >> FM10K_DMA_CTRL_MINMSS_SHIFT)
 #define FM10K_TSO_MIN_HEADERLEN54
-- 
1.9.3



[dpdk-dev] [PATCH v2 01/16] fm10k: use default mailbox message handler for pf

2016-01-27 Thread Wang Xiao W
The new share code makes fm10k_msg_update_pvid_pf function static, so we can
not refer to it now in fm10k_ethdev.c. The registered pf handler is almost the
same as the default pf handler, removing it has no impact on mailbox.

Signed-off-by: Wang Xiao W 
---
 drivers/net/fm10k/fm10k_ethdev.c | 17 ++---
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_ethdev.c b/drivers/net/fm10k/fm10k_ethdev.c
index e4aed94..2c38ce9 100644
--- a/drivers/net/fm10k/fm10k_ethdev.c
+++ b/drivers/net/fm10k/fm10k_ethdev.c
@@ -2367,29 +2367,16 @@ static const struct fm10k_msg_data fm10k_msgdata_vf[] = 
{
FM10K_TLV_MSG_ERROR_HANDLER(fm10k_tlv_msg_error),
 };

-/* Mailbox message handler in PF */
-static const struct fm10k_msg_data fm10k_msgdata_pf[] = {
-   FM10K_PF_MSG_ERR_HANDLER(XCAST_MODES, fm10k_msg_err_pf),
-   FM10K_PF_MSG_ERR_HANDLER(UPDATE_MAC_FWD_RULE, fm10k_msg_err_pf),
-   FM10K_PF_MSG_LPORT_MAP_HANDLER(fm10k_msg_lport_map_pf),
-   FM10K_PF_MSG_ERR_HANDLER(LPORT_CREATE, fm10k_msg_err_pf),
-   FM10K_PF_MSG_ERR_HANDLER(LPORT_DELETE, fm10k_msg_err_pf),
-   FM10K_PF_MSG_UPDATE_PVID_HANDLER(fm10k_msg_update_pvid_pf),
-   FM10K_TLV_MSG_ERROR_HANDLER(fm10k_tlv_msg_error),
-};
-
 static int
 fm10k_setup_mbx_service(struct fm10k_hw *hw)
 {
-   int err;
+   int err = 0;

/* Initialize mailbox lock */
fm10k_mbx_initlock(hw);

/* Replace default message handler with new ones */
-   if (hw->mac.type == fm10k_mac_pf)
-   err = hw->mbx.ops.register_handlers(>mbx, fm10k_msgdata_pf);
-   else
+   if (hw->mac.type == fm10k_mac_vf)
err = hw->mbx.ops.register_handlers(>mbx, fm10k_msgdata_vf);

if (err) {
-- 
1.9.3



[dpdk-dev] [PATCH v2 00/16] fm10k: update shared code

2016-01-27 Thread Wang Xiao W
v2:
* Put the two extra fix patches ahead of the base code patches.

Wang Xiao W (16):
  fm10k: use default mailbox message handler for pf
  fm10k/base: add macro definitions that are needed
  fm10k/base: cleanup namespace pollution and correct typecast
  fm10k/base: use bitshift for itr_scale
  fm10k/base: reset max_queues on init_hw_vf failure
  fm10k/base: document ITR scale workaround in VF TDLEN register
  fm10k/base: fix checkpatch warning
  fm10k/base: use BIT macro instead of open-coded bit-shifting of 1
  fm10k/base: do not use CamelCase
  fm10k/base: use memcpy for mac addr copy
  fm10k/base: allow removal of is_slot_appropriate function
  fm10k/base: consistently use VLAN ID when referencing vid variables
  fm10k/base: fix comment per upstream review changes
  fm10k/base: TLV structures must be 4byte aligned, not 1byte aligned
  fm10k/base: move constants to the right of binary operators
  fm10k/base: minor cleanups

 drivers/net/fm10k/base/fm10k_api.c   |   2 +
 drivers/net/fm10k/base/fm10k_api.h   |   2 +
 drivers/net/fm10k/base/fm10k_mbx.c   |  63 +++-
 drivers/net/fm10k/base/fm10k_mbx.h   |  11 +--
 drivers/net/fm10k/base/fm10k_osdep.h |  30 ++
 drivers/net/fm10k/base/fm10k_pf.c|  88 +
 drivers/net/fm10k/base/fm10k_pf.h|  18 ++--
 drivers/net/fm10k/base/fm10k_tlv.c   |  40 
 drivers/net/fm10k/base/fm10k_tlv.h   |   9 +-
 drivers/net/fm10k/base/fm10k_type.h  | 182 +++
 drivers/net/fm10k/base/fm10k_vf.c|  32 --
 drivers/net/fm10k/fm10k_ethdev.c |  41 +++-
 12 files changed, 220 insertions(+), 298 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH v5 8/9] virtio: add 1.0 support

2016-01-27 Thread Yuanhan Liu
On Thu, Jan 21, 2016 at 12:37:42PM +0100, Thomas Monjalon wrote:
> 2016-01-19 16:12, Yuanhan Liu:
> > +#define IO_READ_DEF(nr_bits, type) \
> > +static inline type \
> > +io_read##nr_bits(type *addr)   \
> > +{  \
> > +   return *(volatile type *)addr;  \
> > +}
> > +
> > +#define IO_WRITE_DEF(nr_bits, type)\
> > +static inline void \
> > +io_write##nr_bits(type val, type *addr)\
> > +{  \
> > +   *(volatile type *)addr = val;   \
> > +}
> > +
> > +IO_READ_DEF (8, uint8_t)
> > +IO_WRITE_DEF(8, uint8_t)
> > +
> > +IO_READ_DEF (16, uint16_t)
> > +IO_WRITE_DEF(16, uint16_t)
> > +
> > +IO_READ_DEF (32, uint32_t)
> > +IO_WRITE_DEF(32, uint32_t)
> 
> Yes you can do this.
> But not sure you should.
> 
> > +static inline void
> > +io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
> > +{
> > +   io_write32(val & ((1ULL << 32) - 1), lo);
> > +   io_write32(val >> 32,hi);
> > +}
> 
> When debugging this code, how GDB behave?
> How to find the definition of io_write32() with grep or simple editors?

Okay, I will unfold them.

--yliu


[dpdk-dev] [PATCH v2] ip_pipeline: fix cpu socket-id error

2016-01-27 Thread Jasvinder Singh
This patch fixes the socket-id error in ip_pipeline sample
application running over uni-processor systems.

Signed-off-by: Jasvinder Singh 
Acked-by: Cristian Dumitrescu 
---
v2:
- used SOCKET_ID_ANY instead of -1

 examples/ip_pipeline/init.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/examples/ip_pipeline/init.c b/examples/ip_pipeline/init.c
index 186ca03..c4601c9 100644
--- a/examples/ip_pipeline/init.c
+++ b/examples/ip_pipeline/init.c
@@ -835,6 +835,14 @@ app_init_link_frag_ras(struct app_params *app)
}
 }

+static inline int
+app_get_cpu_socket_id(uint32_t pmd_id)
+{
+   int status = rte_eth_dev_socket_id(pmd_id);
+
+   return (status != SOCKET_ID_ANY) ? status : 0;
+}
+
 static void
 app_init_link(struct app_params *app)
 {
@@ -890,7 +898,7 @@ app_init_link(struct app_params *app)
p_link->pmd_id,
rxq_queue_id,
p_rxq->size,
-   rte_eth_dev_socket_id(p_link->pmd_id),
+   app_get_cpu_socket_id(p_link->pmd_id),
_rxq->conf,
app->mempool[p_rxq->mempool_id]);
if (status < 0)
@@ -917,7 +925,7 @@ app_init_link(struct app_params *app)
p_link->pmd_id,
txq_queue_id,
p_txq->size,
-   rte_eth_dev_socket_id(p_link->pmd_id),
+   app_get_cpu_socket_id(p_link->pmd_id),
_txq->conf);
if (status < 0)
rte_panic("%s (%" PRIu32 "): "
@@ -989,7 +997,7 @@ app_init_tm(struct app_params *app)
/* TM */
p_tm->sched_port_params.name = p_tm->name;
p_tm->sched_port_params.socket =
-   rte_eth_dev_socket_id(p_link->pmd_id);
+   app_get_cpu_socket_id(p_link->pmd_id);
p_tm->sched_port_params.rate =
(uint64_t) link_eth_params.link_speed * 1000 * 1000 / 8;

-- 
2.5.0



[dpdk-dev] [PATCH v5 8/9] virtio: add 1.0 support

2016-01-27 Thread Yuanhan Liu
On Thu, Jan 21, 2016 at 12:49:10PM +0100, Thomas Monjalon wrote:
> 2016-01-19 16:12, Yuanhan Liu:
> >  int
> >  vtpci_init(struct rte_pci_device *dev, struct virtio_hw *hw)
> >  {
> > -   hw->vtpci_ops = _ops;
> > +   hw->dev = dev;
> > +
> > +   /*
> > +* Try if we can succeed reading virtio pci caps, which exists
> > +* only on modern pci device. If failed, we fallback to legacy
> > +* virtio handling.
> > +*/
> > +   if (virtio_read_caps(dev, hw) == 0) {
> > +   PMD_INIT_LOG(INFO, "modern virtio pci detected.");
> > +   hw->vtpci_ops = _ops;
> > +   hw->modern= 1;
> > +   dev->driver->drv_flags |= RTE_PCI_DRV_INTR_LSC;
> > +   return 0;
> > +   }
> 
> RTE_PCI_DRV_INTR_LSC is already set by virtio_resource_init_by_uio().

We don't go that far here. Here we just detect if it's a modern virtio
device. And if yes, we do some modern initiations, and return.

virtio_resource_init_by_uio() is invoked when virtio_read_caps() fails.

> Do you mean interrupt was not supported with legacy virtio?

Nope. this patch set changes nothing on legacy virtio support.

--yliu


[dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# Inter-VM communication & IP allocation through DHCP issue

2016-01-27 Thread Czesnowicz, Przemyslaw
Hi Abhijeet,


It seems you are almost there!
When booting the VMs do you request hugepage memory for them (by setting 
hw:mem_page_size=large in flavor extra_spec)?
If not then please do; if yes then please look into the libvirt logfiles for the 
VMs (in /var/log/libvirt/qemu/instance-xxx), I think there could be a clue.


Regards
Przemek

From: Abhijeet Karve [mailto:abhijeet.ka...@tcs.com]
Sent: Monday, January 25, 2016 6:13 PM
To: Czesnowicz, Przemyslaw
Cc: dev at dpdk.org; discuss at openvswitch.org; Gray, Mark D
Subject: RE: [dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# Inter-VM 
communication & IP allocation through DHCP issue

Hi Przemek,

Thank you for your response, It really provided us breakthrough.

After setting up DPDK on compute node for stable/kilo, We are trying to set up 
Openstack stable/liberty all-in-one setup, At present we are not able to get 
the IP allocation for the vhost type instances through DHCP. Also we tried 
assigning IP's manually to them but the inter-VM communication also not 
happening,

#neutron agent-list
root at nfv-dpdk-devstack:/etc/neutron# neutron agent-list
+--++---+---++---+
| id   | agent_type | host  
| alive | admin_state_up | binary|
+--++---+---++---+
| 3b29e93c-3a25-4f7d-bf6c-6bb309db5ec0 | DPDK OVS Agent | nfv-dpdk-devstack 
| :-)   | True   | neutron-openvswitch-agent |
| 62593b2c-c10f-4d93-8551-c46ce24895a6 | L3 agent   | nfv-dpdk-devstack 
| :-)   | True   | neutron-l3-agent  |
| 7cb97af9-cc20-41f8-90fb-aba97d39dfbd | DHCP agent | nfv-dpdk-devstack 
| :-)   | True   | neutron-dhcp-agent|
| b613c654-99b7-437e-9317-20fa651a1310 | Linux bridge agent | nfv-dpdk-devstack 
| :-)   | True   | neutron-linuxbridge-agent |
| c2dd0384-6517-4b44-9c25-0d2825d23f57 | Metadata agent | nfv-dpdk-devstack 
| :-)   | True   | neutron-metadata-agent|
| f23dde40-7dc0-4f20-8b3e-eb90ddb15e49 | Open vSwitch agent | nfv-dpdk-devstack 
| xxx   | True   | neutron-openvswitch-agent |
+--++---+---++---+


ovs-vsctl show output#

Bridge br-dpdk
Port br-dpdk
Interface br-dpdk
type: internal
Port phy-br-dpdk
Interface phy-br-dpdk
type: patch
options: {peer=int-br-dpdk}
Bridge br-int
fail_mode: secure
Port "vhufa41e799-f2"
tag: 5
Interface "vhufa41e799-f2"
type: dpdkvhostuser
Port int-br-dpdk
Interface int-br-dpdk
type: patch
options: {peer=phy-br-dpdk}
Port "tap4e19f8e1-59"
tag: 5
Interface "tap4e19f8e1-59"
type: internal
Port "vhu05734c49-3b"
tag: 5
Interface "vhu05734c49-3b"
type: dpdkvhostuser
Port "vhu10c06b4d-84"
tag: 5
Interface "vhu10c06b4d-84"
type: dpdkvhostuser
Port patch-tun
Interface patch-tun
type: patch
options: {peer=patch-int}
Port "vhue169c581-ef"
tag: 5
Interface "vhue169c581-ef"
type: dpdkvhostuser
Port br-int
Interface br-int
type: internal
Bridge br-tun
fail_mode: secure
Port br-tun
Interface br-tun
type: internal
error: "could not open network device br-tun (Invalid argument)"
Port patch-int
Interface patch-int
type: patch
options: {peer=patch-tun}
ovs_version: "2.4.0"



ovs-ofctl dump-flows br-int#

root at nfv-dpdk-devstack:/etc/neutron# ovs-ofctl dump-flows br-int
NXST_FLOW reply (xid=0x4):
 cookie=0xaaa002bb2bcf827b, duration=2410.012s, table=0, n_packets=0, 
n_bytes=0, idle_age=2410, priority=10,icmp6,in_port=43,icmp_type=136 
actions=resubmit(,24)
 cookie=0xaaa002bb2bcf827b, duration=2409.480s, table=0, n_packets=0, 
n_bytes=0, idle_age=2409, priority=10,icmp6,in_port=44,icmp_type=136 
actions=resubmit(,24)
 cookie=0xaaa002bb2bcf827b, duration=2408.704s, table=0, n_packets=0, 
n_bytes=0, idle_age=2408, priority=10,icmp6,in_port=45,icmp_type=136 
actions=resubmit(,24)
 cookie=0xaaa002bb2bcf827b, duration=2408.155s, table=0, n_packets=0, 
n_bytes=0, idle_age=2408, 

[dpdk-dev] [PATCH 2/5] vhost: refactor virtio_dev_rx

2016-01-27 Thread Yuanhan Liu
On Thu, Jan 21, 2016 at 02:50:01PM +0100, Jérôme Jutteau wrote:
> Hi Yuanhan,
> 
> 2015-12-14 2:47 GMT+01:00 Yuanhan Liu :
> > Right, I should move it in the beginning of this function.
> 
> Any news about this refactoring ?

Hi Jérôme,

Thanks for showing interest in this patch set; I was waiting for
Huawei's comments. And fortunately, he has started making comments.

--yliu


[dpdk-dev] [PATCH 1/5] vhost: refactor rte_vhost_dequeue_burst

2016-01-27 Thread Yuanhan Liu
On Tue, Jan 26, 2016 at 10:30:12AM +, Xie, Huawei wrote:
> On 12/3/2015 2:03 PM, Yuanhan Liu wrote:
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  lib/librte_vhost/vhost_rxtx.c | 287 
> > +-
> >  1 file changed, 113 insertions(+), 174 deletions(-)
> 
> Prefer to unroll copy_mbuf_to_desc and your COPY macro. It prevents us

I'm okay to unroll COPY macro. But for copy_mbuf_to_desc, I prefer not
to do that, unless it has a good reason.

> processing descriptors in a burst way in future.

So, do you have a plan?

--yliu


[dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode

2016-01-27 Thread David Marchand
On Wed, Jan 27, 2016 at 11:12 AM, Thomas Monjalon
 wrote:
> 2016-01-27 10:08, Burakov, Anatoly:
>> > Why a new file for these functions?
>>
>> Well, my thought was to make future extensions easier by way of avoiding 
>> mixing irrelevant and/or general code with driver-specific code. I can 
>> change it back if that's not OK.
>
> No strong opinion here.
> David?

Hum, no strong opinion either, but I don't think we really need to
split this file for this much code.
Besides, if we keep all code in eal_pci_vfio.c, there is no need to
expose those structures through eal_pci_init.h.


-- 
David Marchand


[dpdk-dev] [PATCH 4/5] vhost: do not use rte_memcpy for virtio_hdr copy

2016-01-27 Thread Yuanhan Liu
On Wed, Jan 27, 2016 at 02:46:39AM +, Xie, Huawei wrote:
> On 12/3/2015 2:03 PM, Yuanhan Liu wrote:
> > +   if (vq->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
> > +   *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
> > +   } else {
> > +   *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
> > +   }
> 
> Thanks!
> We might simplify this further. Just reset the first two fields flags
> and gso_type.

What's this "simplification" for? Not to mention that we will add
TSO support, which modifies a few more fields, such as csum_start: resetting
only the first two fields is wrong here.

--yliu


[dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode

2016-01-27 Thread Thomas Monjalon
2016-01-27 10:08, Burakov, Anatoly:
> > Why a new file for these functions?
> 
> Well, my thought was to make future extensions easier by way of avoiding 
> mixing irrelevant and/or general code with driver-specific code. I can change 
> it back if that's not OK.

No strong opinion here.
David?


[dpdk-dev] bnx2x driver and 57800 versus 57810

2016-01-27 Thread Chas Williams
On Wed, 2016-01-27 at 07:32 +, Harish Patil wrote:
> >
> >I have to practically identical systems, same hypervisor on each
> (Centos
> >7.x).??In one, I have a 57800 card which works fine with DPDK with
> >SRIOV.??In the other, I have a 57810 card which doesn't work with
> SRIOV.
> >
> >For the 57810 I have tracked this down to the status block in the VF
> >failing to be updated.??The linux driver works fine but it appears to
> >use a slightly different scheme -- writing some sort of fastpath
> status
> >block generation per interrupt.
> >
> >Does anyone have any suggestions or a programming guide for this
> device?
> >
> >
>?
> What is not working with 57810? Is it link related or traffic? Please
> provide the details.
> Attached is the SW programming guide for 577xx/578xx. I?m not sure if
> it has details pertaining to the specific issue that you have.

The DPDK PMD driver seems to be able to transmit packets on the 57810.
But since the status block isn't getting updated, you can't reclaim the
sent buffers.  I modified the driver to use the marker based receive
detection (similar to the method used in the Linux driver) and I can see
packets getting received (certainly broadcast is received -- possibly
not unicast packets though, which seems to indicate that part of the
RX path is possibly still broken).

I have tried a couple of things.  The status page in the DPDK PMD driver
isn't getting page aligned (as well as a bunch of other structures
that should probably be page aligned). The Linux driver happens to do
this as a side effect of the DMA allocator.  Fixing this didn't seem to
improve matters though.  The status block doesn't seem to get updated.
I verified that the correct DMA address is getting passed to the PF.
And since it works on the 57800, I thought perhaps something changed.

Also, the DPDK driver probably gets the RX/TX queue indices wrong during
initial setup.  The final values coming out of the allocation loop
are probably bigger than they should be.  Should they point to the end
of the queue or just past the end?  Also, the tail of the queue needs
to be corrected for the double entry at the end of the pages.  Again,
fixing this didn't seem to help either.

The VF-PF interaction seems to be ok as well.  Other than not supporting
SGE, the DPDK PMD driver seems to send reasonably correct messages to
the PF.

I don't see the DPDK PMD driver doing anything to 'reset' the PCI aspect
of the VF.  If there is any left over configuration for interrupts,
like leaving the IGU enabled, that may not be cleared, I am not sure
what the interaction might be.  I do know the Linux driver does seem
to use MSI-X interrupts.

> Thanks,
> Harish

Thanks for looking at this and thanks for the programming guide.  It
will
take me a bit to digest it.


[dpdk-dev] [PATCH v2 1/2] ethdev: remove useless null checks

2016-01-27 Thread David Marchand
On Tue, Jan 26, 2016 at 4:50 PM, Jan Viktorin  
wrote:
> What about the RTE_VERIFY? I think, it's more appropriate here.

Well, here, I am removing useless checks in static functions.

But for the rest of ethdev api, I agree we could add some RTE_VERIFY.

> Otherwise, feel free to add:
>
> Reviewed-by: Jan Viktorin 

Thanks.


-- 
David Marchand


[dpdk-dev] [PATCH] eal: add function to check if primary proc alive

2016-01-27 Thread Van Haaren, Harry
> From: Richardson, Bruce
> > Agreed, however hiding it totally removes the flexibility of waiting for a 
> > primary
> > that is starting with --file-prefix (aka: in a non-default location). 
> > Imposing
> > a limit on only monitoring primary procs in the default location seems 
> > wrong.
> 
> But the secondary also needs the same prefix. Is that prefix not accessible by
> this function to be used?

The issue is that the EAL parsing code is performed during rte_init(), which
is exactly what this function tries to avoid - initializing EAL before a primary
process starts.

I looked at changing the EAL parsing to come before rte_init(), and considered
adding a minimal parser for --file-prefix. Both routes seem a bad solution,
either for complexity or code-duplication.

v2 of this patch posted to list:
http://dpdk.org/dev/patchwork/patch/10126/

-Harry


[dpdk-dev] [PATCH] log: add missing symbol

2016-01-27 Thread Thomas Monjalon
2015-12-16 16:38, Stephen Hemminger:
> rte_get_log_type and rte_get_log_level functions has been avaliable
> for many versions. But they are missing from the shared library map
> and therefore do not get exported correctly.
> 
> Signed-off-by: Stephen Hemminger 
> ---
>  lib/librte_eal/linuxapp/eal/rte_eal_version.map | 2 ++
>  1 file changed, 2 insertions(+)

Why only in linuxapp?

> diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map 
> b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> index cbe175f..51a241c 100644
> --- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> +++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> @@ -93,7 +93,9 @@ DPDK_2.0 {
>   rte_realloc;
>   rte_set_application_usage_hook;
>   rte_set_log_level;
> + rte_get_log_level;
>   rte_set_log_type;
> + rte_get_log_type;

We try to keep an alphabetical order :)



[dpdk-dev] [PATCH v5 01/11] virtio: Introduce config RTE_VIRTIO_INC_VECTOR

2016-01-27 Thread Yuanhan Liu
On Wed, Jan 27, 2016 at 07:53:21AM +0530, Santosh Shukla wrote:
> Ping?

I was on vacation late last week. And I was quite busy till now after
the vacation. So, sorry that I still don't have time to do more detailed
reviews in 1 or 2 days. Hopefully I can make it by this Friday.

BTW, I had a quick glance at this patchset; overall, it looks much
better now, except the EAL changes (I'm not the maintainer) and the
virtio io port read/write stuff: Tetsuya suggested adding another
access wrapper, but I have a few concerns about that. Anyway, I don't
have time for deeper thoughts, and I will re-think it later.

--yliu


[dpdk-dev] [PATCH v2] eal: add function to check if primary proc alive

2016-01-27 Thread Harry van Haaren
This patch adds a new function to the EAL API:
int rte_eal_primary_proc_alive(const char *path);

The function indicates if a primary process is alive right now.
This functionality is implemented by testing for a write-
lock on the config file, and the function tests for a lock.

The use case for this functionality is that a secondary
process can wait until a primary process starts by polling
the function and waiting. Once the primary is running, the
secondary can continue to poll, so that it detects when the
primary process has quit unexpectedly.

The RTE_MAGIC number is written to the shared config by the
primary process; this is the signal to the secondary process
that the EAL is set up and ready to be used. The function
rte_eal_mcfg_complete() writes RTE_MAGIC. This has been
delayed in the EAL init procedure, as the PCI probing in
the primary process can interfere with the secondary running.

Signed-off-by: Harry van Haaren 
---

v2:
- Passing NULL as const char* uses default /var/run/.rte_config
- Moved code into /common/ instead of /linuxapp/, should work on BSD now

 doc/guides/rel_notes/release_2_3.rst|  7 +++
 lib/librte_eal/bsdapp/eal/Makefile  |  1 +
 lib/librte_eal/bsdapp/eal/rte_eal_version.map   |  8 
 lib/librte_eal/common/eal_common_proc.c | 61 +
 lib/librte_eal/common/include/rte_eal.h | 18 
 lib/librte_eal/linuxapp/eal/Makefile|  1 +
 lib/librte_eal/linuxapp/eal/eal.c   |  4 +-
 lib/librte_eal/linuxapp/eal/rte_eal_version.map |  7 +++
 8 files changed, 105 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_eal/common/eal_common_proc.c

diff --git a/doc/guides/rel_notes/release_2_3.rst 
b/doc/guides/rel_notes/release_2_3.rst
index 99de186..14b5b06 100644
--- a/doc/guides/rel_notes/release_2_3.rst
+++ b/doc/guides/rel_notes/release_2_3.rst
@@ -11,6 +11,13 @@ Resolved Issues
 EAL
 ~~~

+* **Added rte_eal_primary_proc_alive() function**
+
+  A new function ``rte_eal_primary_proc_alive()`` has been added
+  to allow the user to detect if a primary process is running.
+  Use cases for this feature include fault detection, and monitoring
+  using secondary processes.
+

 Drivers
 ~~~
diff --git a/lib/librte_eal/bsdapp/eal/Makefile 
b/lib/librte_eal/bsdapp/eal/Makefile
index 65b293f..2d6e3b1 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -61,6 +61,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_alarm.c

 # from common dir
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_lcore.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_proc.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_timer.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_memzone.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_log.c
diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map 
b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
index 9d7adf1..0e28017 100644
--- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
@@ -135,3 +135,11 @@ DPDK_2.2 {
rte_xen_dom0_supported;

 } DPDK_2.1;
+
+
+DPDK_2.3 {
+   global:
+
+   rte_eal_primary_proc_alive;
+
+} DPDK_2.2;
diff --git a/lib/librte_eal/common/eal_common_proc.c 
b/lib/librte_eal/common/eal_common_proc.c
new file mode 100644
index 000..c598891
--- /dev/null
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -0,0 +1,61 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2016 Intel Shannon Ltd. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 

[dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode

2016-01-27 Thread Burakov, Anatoly
> >> > Why a new file for these functions?
> >>
> >> Well, my thought was to make future extensions easier by way of
> avoiding mixing irrelevant and/or general code with driver-specific code. I 
> can
> change it back if that's not OK.
> >
> > No strong opinion here.
> > David?
> 
> Hum, no strong opinion either, but I don't think we really need to split this
> file for this much code.
> Besides, if we keep all code in eal_pci_vfio.c, there is no need to expose
> those structures through eal_pci_init.h.

OK then, I'll merge it back into the eal_pci_vfio.c

Thanks,
Anatoly


[dpdk-dev] [PATCH] ethdev: fix byte order inconsistence between fdir flow and mask

2016-01-27 Thread Thomas Monjalon
2016-01-27 16:37, Jingjing Wu:
> Fixed issue of byte order in ethdev library that the structure
> for setting fdir's mask and flow entry is inconsist and made
> inputs of mask be in big endian.

Please be more precise. Which one is big endian?
Wasn't it tested before?

> fixes: 76c6f89e80d4 ("ixgbe: support new flow director masks")
>2d4c1a9ea2ac ("ethdev: add new flow director masks")

Please put Fixes: on the two lines.

> --- a/doc/guides/rel_notes/release_2_3.rst
> +++ b/doc/guides/rel_notes/release_2_3.rst
> @@ -19,6 +19,10 @@ Drivers
>  Libraries
>  ~
>  
> +* ** fix byte order inconsistence between fdir flow and mask **
> +
> +  Fixed issue in ethdev library that the structure for setting
> +  fdir's mask and flow entry is inconsist in byte order.

John, comment on release notes formatting?
It's important to have the first items well formatted.

> @@ -39,6 +43,8 @@ API Changes
>  ABI Changes
>  ---
>  
> +* The fields in  The ethdev structures ``rte_eth_fdir_masks`` were
> +  changed to be in big endian.

Please take care of uppercase typo here.

> - /* write all the same so that UDP, TCP and SCTP use the same mask */
> + /* write all the same so that UDP, TCP and SCTP use the same mask
> +  * (little-endian)
> + */

Spacing typo here.
Sorry for the nits ;)

> - uint8_t mac_addr_byte_mask;  /** Per byte MAC address mask */
> + uint8_t mac_addr_byte_mask;  /** Bit mask for associated byte */
>   uint32_t tunnel_id_mask;  /** tunnel ID mask */
> - uint8_t tunnel_type_mask;
> + uint8_t tunnel_type_mask; /**< 1 - Match tunnel type,
> +0 - Ignore tunnel type. */

These changes seem unrelated with the patch.
It's good to improve doc of this API but it's maybe not enough.
Example:
uint8_t mac_addr_byte_mask;  /** Bit mask for associated byte */
Are we sure everybody understand how to fill it?


[dpdk-dev] [PATCH] ip_pipeline: add load balancing function to pass-through pipeline

2016-01-27 Thread Jasvinder Singh
The passthrough pipeline implementation is extended with load balancing
function. This function allows uniform distribution of the packets among
its output ports. For packets distribution, any application level logic
can be applied. For instance, in this implementation, hash value
computed over specific header fields of the incoming packets has been
used to spread traffic uniformly among the output ports. The following
passthrough configuration can be used for implementing load balancing
function over ipv4 traffic;

[PIPELINE0]
type = PASS-THROUGH
core = 0
pktq_in = RXQ0.0 RXQ1.0 RXQ2.0 RXQ3.0
pktq_out = TXQ0.0 TXQ1.0 TXQ2.0 TXQ3.0
dma_src_offset = 278; mbuf (128) + headroom (128) + 1st ethertype offset (14) + 
ttl offset within ip header = 278 (ipv4)
dma_dst_offset = 128; mbuf (128)
dma_size = 16
dma_src_mask = 00FF
dma_hash_offset = 144; (dma_dst_offset+dma_size)
lb = hash

Signed-off-by: Jasvinder Singh 
Acked-by: Cristian Dumitrescu 
---
 .../ip_pipeline/pipeline/pipeline_actions_common.h |  22 ++
 .../ip_pipeline/pipeline/pipeline_passthrough_be.c | 281 -
 .../ip_pipeline/pipeline/pipeline_passthrough_be.h |   2 +
 3 files changed, 245 insertions(+), 60 deletions(-)

diff --git a/examples/ip_pipeline/pipeline/pipeline_actions_common.h 
b/examples/ip_pipeline/pipeline/pipeline_actions_common.h
index 9958758..2c08db2 100644
--- a/examples/ip_pipeline/pipeline/pipeline_actions_common.h
+++ b/examples/ip_pipeline/pipeline/pipeline_actions_common.h
@@ -59,6 +59,28 @@ f_ah(
\
return 0;   \
 }

+#define PIPELINE_PORT_IN_AH_LB(f_ah, f_pkt_work, f_pkt4_work) \
+static int \
+f_ah(  \
+   struct rte_pipeline *p, \
+   struct rte_mbuf **pkts, \
+   uint32_t n_pkts,\
+   void *arg)  \
+{  \
+   uint32_t i; \
+   \
+   uint64_t pkt_mask = RTE_LEN2MASK(n_pkts, uint64_t); \
+   \
+   rte_pipeline_ah_packet_hijack(p, pkt_mask); \
+   for (i = 0; i < (n_pkts & (~0x3LLU)); i += 4)   \
+   f_pkt4_work(&pkts[i], arg); \
+   \
+   for ( ; i < n_pkts; i++)\
+   f_pkt_work(pkts[i], arg);   \
+   \
+   return 0;   \
+}
+
 #define PIPELINE_TABLE_AH_HIT(f_ah, f_pkt_work, f_pkt4_work)   \
 static int \
 f_ah(  \
diff --git a/examples/ip_pipeline/pipeline/pipeline_passthrough_be.c 
b/examples/ip_pipeline/pipeline/pipeline_passthrough_be.c
index 7642462..75b6fd8 100644
--- a/examples/ip_pipeline/pipeline/pipeline_passthrough_be.c
+++ b/examples/ip_pipeline/pipeline/pipeline_passthrough_be.c
@@ -72,7 +72,9 @@ pkt_work(
struct rte_mbuf *pkt,
void *arg,
uint32_t dma_size,
-   uint32_t hash_enabled)
+   uint32_t hash_enabled,
+   uint32_t lb_hash,
+   uint32_t port_out_pw2)
 {
struct pipeline_passthrough *p = arg;

@@ -90,8 +92,24 @@ pkt_work(
dma_dst[i] = dma_src[i] & dma_mask[i];

/* Read (dma_dst), compute (hash), write (hash) */
-   if (hash_enabled)
-   *dma_hash = p->f_hash(dma_dst, dma_size, 0);
+   if (hash_enabled) {
+   uint32_t hash = p->f_hash(dma_dst, dma_size, 0);
+   *dma_hash = hash;
+
+   if (lb_hash) {
+   uint32_t port_out;
+
+   if (port_out_pw2)
+   port_out
+   = hash & (p->p.n_ports_out - 1);
+   else
+   port_out
+   = hash % p->p.n_ports_out;
+
+   rte_pipeline_port_out_packet_insert(p->p.p,
+   port_out, pkt);
+   }
+   }
 }

 static inline __attribute__((always_inline)) void
@@ -99,7 +117,9 @@ pkt4_work(
struct rte_mbuf **pkts,
void *arg,
uint32_t dma_size,
-   uint32_t hash_enabled)
+   uint32_t hash_enabled,
+   uint32_t lb_hash,
+   

[dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode

2016-01-27 Thread Thomas Monjalon
Hi Anatoly,

Few small comments.

The comments "function pointer typedef" or "structure to hold" don't
bring new information. Please keep it short.

2016-01-13 12:36, Anatoly Burakov:
> +/* function pointer typedef for DMA mapping functions */

->  DMA mapping function type
It would be relevant to describe the return and the parameter.

> +typedef  int (*vfio_dma_func_t)(int);
> +
> +/* Structure to hold supported IOMMU types */

This comment seems useless.

> +struct vfio_iommu_type {

[...]
> +/* function prototypes for different IOMMU types */

idem

> +int vfio_iommu_type1_dma_map(int container_fd);
> +int vfio_iommu_noiommu_dma_map(int container_fd);
> +
> +/* IOMMU types we support */
> +static const struct vfio_iommu_type iommu_types[] = {
> + /* x86 IOMMU, otherwise known as type 1 */
> + { VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map},
> + /* IOMMU-less mode */
> + { VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map},
> +};

[...]
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c

Why a new file for these functions?



[dpdk-dev] [RFC PATCH 5/5] virtio: Extend virtio-net PMD to support container environment

2016-01-27 Thread Xie, Huawei
On 1/21/2016 7:09 PM, Tetsuya Mukawa wrote:
> + /* Set BAR region */
> + for (i = 0; i < NB_BAR; i++) {
> + switch (dev->bar[i].type) {
> + case QTEST_PCI_BAR_IO:
> + case QTEST_PCI_BAR_MEMORY_UNDER_1MB:
> + case QTEST_PCI_BAR_MEMORY_32:
> + qtest_pci_outl(s, bus, device, 0, dev->bar[i].addr,
> + dev->bar[i].region_start);
> + PMD_DRV_LOG(INFO, "Set BAR of %s device: 0x%lx - 
> 0x%lx\n",
> + dev->name, dev->bar[i].region_start,
> + dev->bar[i].region_start + 
> dev->bar[i].region_size);
> + break;
> + case QTEST_PCI_BAR_MEMORY_64:
> + qtest_pci_outq(s, bus, device, 0, dev->bar[i].addr,
> + dev->bar[i].region_start);
> + PMD_DRV_LOG(INFO, "Set BAR of %s device: 0x%lx - 
> 0x%lx\n",
> + dev->name, dev->bar[i].region_start,
> + dev->bar[i].region_start + 
> dev->bar[i].region_size);
> + break;

Hasn't the bar resource already been allocated? Is it the app's
responsibility to allocate the bar resource in qtest mode? The app
couldn't have that knowledge.

> + case QTEST_PCI_BAR_DISABLE:
> + break;
> + }
> + }
> +



[dpdk-dev] [PATCH v2 2/2] i40evf: support interrupt based pf reset request

2016-01-27 Thread Jingjing Wu
Interrupt-based PF reset requests are supported by
enabling adminq event processing in the VF driver.
Users can register a callback for this interrupt event to be
informed when a PF reset request is detected, for example:
  rte_eth_dev_callback_register(portid,
RTE_ETH_EVENT_INTR_RESET,
reset_event_callback,
arg);

Signed-off-by: Jingjing Wu 
---
 doc/guides/rel_notes/release_2_3.rst |   1 +
 drivers/net/i40e/i40e_ethdev_vf.c| 274 +++
 lib/librte_ether/rte_ethdev.h|   1 +
 3 files changed, 246 insertions(+), 30 deletions(-)

diff --git a/doc/guides/rel_notes/release_2_3.rst 
b/doc/guides/rel_notes/release_2_3.rst
index 99de186..73d5f76 100644
--- a/doc/guides/rel_notes/release_2_3.rst
+++ b/doc/guides/rel_notes/release_2_3.rst
@@ -4,6 +4,7 @@ DPDK Release 2.3
 New Features
 

+* **Added pf reset event reported in i40e vf PMD driver.

 Resolved Issues
 ---
diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c
index 64e6957..1ffe64e 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -74,8 +74,6 @@
 #define I40EVF_BUSY_WAIT_DELAY 10
 #define I40EVF_BUSY_WAIT_COUNT 50
 #define MAX_RESET_WAIT_CNT 20
-/*ITR index for NOITR*/
-#define I40E_QINT_RQCTL_MSIX_INDX_NOITR 3

 struct i40evf_arq_msg_info {
enum i40e_virtchnl_ops ops;
@@ -151,6 +149,9 @@ static int
 i40evf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id);
 static int
 i40evf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id);
+static void i40evf_handle_pf_event(__rte_unused struct rte_eth_dev *dev,
+  uint8_t *msg,
+  uint16_t msglen);

 /* Default hash key buffer for RSS */
 static uint32_t rss_key_default[I40E_VFQF_HKEY_MAX_INDEX + 1];
@@ -357,20 +358,42 @@ i40evf_execute_vf_cmd(struct rte_eth_dev *dev, struct 
vf_cmd_info *args)
return err;
}

-   do {
-   /* Delay some time first */
-   rte_delay_ms(ASQ_DELAY_MS);
-   ret = i40evf_read_pfmsg(dev, &info);
-   if (ret == I40EVF_MSG_CMD) {
-   err = 0;
-   break;
-   } else if (ret == I40EVF_MSG_ERR) {
-   err = -1;
-   break;
-   }
-   /* If don't read msg or read sys event, continue */
-   } while (i++ < MAX_TRY_TIMES);
-   _clear_cmd(vf);
+   switch (args->ops) {
+   case I40E_VIRTCHNL_OP_RESET_VF:
+   /*no need to process in this function */
+   break;
+   case I40E_VIRTCHNL_OP_VERSION:
+   case I40E_VIRTCHNL_OP_GET_VF_RESOURCES:
+   /* for init adminq commands, need to poll the response */
+   do {
+   /* Delay some time first */
+   rte_delay_ms(ASQ_DELAY_MS);
+   ret = i40evf_read_pfmsg(dev, &info);
+   if (ret == I40EVF_MSG_CMD) {
+   err = 0;
+   break;
+   } else if (ret == I40EVF_MSG_ERR) {
+   err = -1;
+   break;
+   }
+   /* If don't read msg or read sys event, continue */
+   } while (i++ < MAX_TRY_TIMES);
+   _clear_cmd(vf);
+   break;
+
+   default:
+   /* for other adminq in running time, waiting the cmd done flag 
*/
+   do {
+   /* Delay some time first */
+   rte_delay_ms(ASQ_DELAY_MS);
+   if (vf->pend_cmd == I40E_VIRTCHNL_OP_UNKNOWN) {
+   err = 0;
+   break;
+   }
+   /* If don't read msg or read sys event, continue */
+   } while (i++ < MAX_TRY_TIMES);
+   break;
+   }

return (err | vf->cmd_retval);
 }
@@ -719,7 +742,7 @@ i40evf_config_irq_map(struct rte_eth_dev *dev)

map_info = (struct i40e_virtchnl_irq_map_info *)cmd_buffer;
map_info->num_vectors = 1;
-   map_info->vecmap[0].rxitr_idx = I40E_QINT_RQCTL_MSIX_INDX_NOITR;
+   map_info->vecmap[0].rxitr_idx = I40E_ITR_INDEX_DEFAULT;
map_info->vecmap[0].vsi_id = vf->vsi_res->vsi_id;
/* Alway use default dynamic MSIX interrupt */
map_info->vecmap[0].vector_id = vector_id;
@@ -1093,6 +1116,38 @@ i40evf_dev_atomic_write_link_status(struct rte_eth_dev 
*dev,
return 0;
 }

+/* Disable IRQ0 */
+static inline void
+i40evf_disable_irq0(struct i40e_hw *hw)
+{
+   /* Disable all interrupt types */
+   I40E_WRITE_REG(hw, I40E_VFINT_ICR0_ENA1, 0);
+   I40E_WRITE_REG(hw, I40E_VFINT_DYN_CTL01,
+  

[dpdk-dev] [PATCH v2 1/2] i40evf: allocate virtchnl cmd buffer for each vf

2016-01-27 Thread Jingjing Wu
Currently, the i40evf PMD uses a global static buffer to send virtchnl
commands to the host driver. It is shared by multiple VFs.
This patch allocates a virtchnl cmd buffer for each VF instead.

Signed-off-by: Jingjing Wu 
---
 drivers/net/i40e/i40e_ethdev.h|   2 +
 drivers/net/i40e/i40e_ethdev_vf.c | 181 +++---
 2 files changed, 74 insertions(+), 109 deletions(-)

diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index 1f9792b..93122ad 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -494,7 +494,9 @@ struct i40e_vf {
bool link_up;
bool vf_reset;
volatile uint32_t pend_cmd; /* pending command not finished yet */
+   uint32_t cmd_retval; /* return value of the cmd response from PF */
u16 pend_msg; /* flags indicates events from pf not handled yet */
+   uint8_t *aq_resp; /* buffer to store the adminq response from PF */

/* VSI info */
struct i40e_virtchnl_vf_resource *vf_res; /* All VSIs */
diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c
index 14d2a50..64e6957 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -103,9 +103,6 @@ enum i40evf_aq_result {
I40EVF_MSG_CMD,  /* Read async command result */
 };

-/* A share buffer to store the command result from PF driver */
-static uint8_t cmd_result_buffer[I40E_AQ_BUF_SZ];
-
 static int i40evf_dev_configure(struct rte_eth_dev *dev);
 static int i40evf_dev_start(struct rte_eth_dev *dev);
 static void i40evf_dev_stop(struct rte_eth_dev *dev);
@@ -237,31 +234,39 @@ i40evf_set_mac_type(struct i40e_hw *hw)
 }

 /*
- * Parse admin queue message.
- *
- * return value:
- *  < 0: meet error
- *  0: read sys msg
- *  > 0: read cmd result
+ * Read data in admin queue to get msg from pf driver
  */
 static enum i40evf_aq_result
-i40evf_parse_pfmsg(struct i40e_vf *vf,
-  struct i40e_arq_event_info *event,
-  struct i40evf_arq_msg_info *data)
+i40evf_read_pfmsg(struct rte_eth_dev *dev, struct i40evf_arq_msg_info *data)
 {
-   enum i40e_virtchnl_ops opcode = (enum i40e_virtchnl_ops)\
-   rte_le_to_cpu_32(event->desc.cookie_high);
-   enum i40e_status_code retval = (enum i40e_status_code)\
-   rte_le_to_cpu_32(event->desc.cookie_low);
-   enum i40evf_aq_result ret = I40EVF_MSG_CMD;
+   struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+   struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+   struct i40e_arq_event_info event;
+   enum i40e_virtchnl_ops opcode;
+   enum i40e_status_code retval;
+   int ret;
+   enum i40evf_aq_result result = I40EVF_MSG_NON;

+   event.buf_len = data->buf_len;
+   event.msg_buf = data->msg;
+   ret = i40e_clean_arq_element(hw, &event, NULL);
+   /* Can't read any msg from adminQ */
+   if (ret) {
+   if (ret == I40E_ERR_ADMIN_QUEUE_NO_WORK)
+   result = I40EVF_MSG_NON;
+   else
+   result = I40EVF_MSG_ERR;
+   return result;
+   }
+
+   opcode = (enum 
i40e_virtchnl_ops)rte_le_to_cpu_32(event.desc.cookie_high);
+   retval = (enum i40e_status_code)rte_le_to_cpu_32(event.desc.cookie_low);
/* pf sys event */
if (opcode == I40E_VIRTCHNL_OP_EVENT) {
struct i40e_virtchnl_pf_event *vpe =
-   (struct i40e_virtchnl_pf_event *)event->msg_buf;
+   (struct i40e_virtchnl_pf_event *)event.msg_buf;

-   /* Initialize ret to sys event */
-   ret = I40EVF_MSG_SYS;
+   result = I40EVF_MSG_SYS;
switch (vpe->event) {
case I40E_VIRTCHNL_EVENT_LINK_CHANGE:
vf->link_up =
@@ -286,74 +291,17 @@ i40evf_parse_pfmsg(struct i40e_vf *vf,
}
} else {
/* async reply msg on command issued by vf previously */
-   ret = I40EVF_MSG_CMD;
+   result = I40EVF_MSG_CMD;
/* Actual data length read from PF */
-   data->msg_len = event->msg_len;
+   data->msg_len = event.msg_len;
}
-   /* fill the ops and result to notify VF */
+
data->result = retval;
data->ops = opcode;

-   return ret;
-}
-
-/*
- * Read data in admin queue to get msg from pf driver
- */
-static enum i40evf_aq_result
-i40evf_read_pfmsg(struct rte_eth_dev *dev, struct i40evf_arq_msg_info *data)
-{
-   struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-   struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
-   struct i40e_arq_event_info event;
-   int ret;
-   enum i40evf_aq_result result = I40EVF_MSG_NON;
-
-   event.buf_len = data->buf_len;
-   event.msg_buf = data->msg;
-   ret = 

  1   2   >