[dpdk-dev] [PATCH v3 4/4] virtio: check if kernel driver is manipulating the virtio device
v3 changes: change log message to tell user that the virtio device is skipped due to it is managed by kernel driver, instead of asking user to unbind it from kernel driver. v2 changes: change LOG level from ERR to INFO virtio PMD could use IO port to configure the virtio device without using uio driver(vfio-noniommu mode should work as well). There are two issues with previous implementation: 1) virtio PMD will take over each virtio device blindly even if some are not intended for DPDK. 2) driver conflict between virtio PMD and virtio-net kernel driver. This patch checks if there is any kernel driver manipulating the virtio device before virtio PMD uses IO port to configure the device. Fixes: da978dfdc43b ("virtio: use port IO to get PCI resource") Signed-off-by: Huawei Xie --- drivers/net/virtio/virtio_ethdev.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c index e815acd..ea1874a 100644 --- a/drivers/net/virtio/virtio_ethdev.c +++ b/drivers/net/virtio/virtio_ethdev.c @@ -1138,6 +1138,11 @@ static int virtio_resource_init_by_ioports(struct rte_pci_device *pci_dev) int found = 0; size_t linesz; + if (pci_dev->kdrv != RTE_KDRV_NONE) { + PMD_INIT_LOG(INFO, "skip kernel managed virtio device."); + return -1; + } + snprintf(pci_id, sizeof(pci_id), PCI_PRI_FMT, pci_dev->addr.domain, pci_dev->addr.bus, -- 1.8.1.4
[dpdk-dev] [PATCH v3 3/4] virtio: return 1 to tell the upper layer we don't take over this device
v2 changes: Remove unnecessary assignment of NULL to dev->data->mac_addrs Adjust one comment's position if virtio_resource_init fails, cleanup the resource and return 1 to tell the upper layer we don't take over this device. -1 means error which will cause DPDK to exit. Signed-off-by: Huawei Xie --- drivers/net/virtio/virtio_ethdev.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c index d928339..e815acd 100644 --- a/drivers/net/virtio/virtio_ethdev.c +++ b/drivers/net/virtio/virtio_ethdev.c @@ -1287,8 +1287,13 @@ eth_virtio_dev_init(struct rte_eth_dev *eth_dev) pci_dev = eth_dev->pci_dev; - if (virtio_resource_init(pci_dev) < 0) - return -1; + if (virtio_resource_init(pci_dev) < 0) { + rte_free(eth_dev->data->mac_addrs); + /* Return 1 to tell the upper layer we don't take over +* this device. +*/ + return 1; + } hw->use_msix = virtio_has_msix(&pci_dev->addr); hw->io_base = (uint32_t)(uintptr_t)pci_dev->mem_resource[0].addr; -- 1.8.1.4
[dpdk-dev] [PATCH v3 2/4] eal: set kdrv to RTE_KDRV_NONE if kernel driver isn't manipulating the device.
Use RTE_KDRV_NONE to indicate that kernel driver isn't manipulating the device. Signed-off-by: Huawei Xie Acked-by: David Marchand --- lib/librte_eal/linuxapp/eal/eal_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c index bc5b5be..640b190 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c @@ -362,7 +362,7 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus, else dev->kdrv = RTE_KDRV_UNKNOWN; } else - dev->kdrv = RTE_KDRV_UNKNOWN; + dev->kdrv = RTE_KDRV_NONE; /* device is valid, add in list (sorted) */ if (TAILQ_EMPTY(&pci_device_list)) { -- 1.8.1.4
[dpdk-dev] [PATCH v3 1/4] eal: make the comment more accurate
positive return of rte_eal_pci_probe_one_driver means the driver doesn't support the device. Signed-off-by: Huawei Xie --- lib/librte_eal/common/eal_common_pci.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/librte_eal/common/eal_common_pci.c b/lib/librte_eal/common/eal_common_pci.c index dcfe947..bbcdb2b 100644 --- a/lib/librte_eal/common/eal_common_pci.c +++ b/lib/librte_eal/common/eal_common_pci.c @@ -204,7 +204,7 @@ rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, struct rte_pci_device *d /* call the driver devinit() function */ return dr->devinit(dr, dev); } - /* return positive value if driver is not found */ + /* return positive value if driver doesn't support this device */ return 1; } @@ -259,7 +259,7 @@ rte_eal_pci_detach_dev(struct rte_pci_driver *dr, return 0; } - /* return positive value if driver is not found */ + /* return positive value if driver doesn't support this device */ return 1; } @@ -283,7 +283,7 @@ pci_probe_all_drivers(struct rte_pci_device *dev) /* negative value is an error */ return -1; if (rc > 0) - /* positive value means driver not found */ + /* positive value means driver doesn't support it */ continue; return 0; } @@ -310,7 +310,7 @@ pci_detach_all_drivers(struct rte_pci_device *dev) /* negative value is an error */ return -1; if (rc > 0) - /* positive value means driver not found */ + /* positive value means driver doesn't support it */ continue; return 0; } -- 1.8.1.4
[dpdk-dev] [PATCH v3 0/4] fix the issue that DPDK takes over virtio device blindly
v3 changes: change log message to tell user that the virtio device is skipped because it is managed by a kernel driver, instead of asking user to unbind it from kernel driver. v2 changes: Remove unnecessary assignment of NULL to dev->data->mac_addrs Adjust one comment's position change LOG level from ERR to INFO virtio PMD doesn't set RTE_PCI_DRV_NEED_MAPPING in drv_flags of its eth_driver. It will try igb_uio and PORT IO in turn to configure virtio device. Even if the user in the guest VM doesn't want to use virtio for DPDK, virtio PMD will take over the device blindly. The more serious problem is kernel driver is still manipulating the device, which causes driver conflict. This patch checks if there is any kernel driver manipulating the virtio device before virtio PMD uses port IO to configure the device. Huawei Xie (4): eal: make the comment more accurate eal: set kdrv to RTE_KDRV_NONE if kernel driver isn't manipulating the device. virtio: return 1 to tell the upper layer we don't take over this device virtio: check if kernel driver is manipulating the virtio device drivers/net/virtio/virtio_ethdev.c | 14 -- lib/librte_eal/common/eal_common_pci.c | 8 lib/librte_eal/linuxapp/eal/eal_pci.c | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) -- 1.8.1.4
[dpdk-dev] [dpdk-dev,v2] Clean up rte_memcpy.h file
> Remove unnecessary type casting in functions. > > Tested on Ubuntu (14.04 x86_64) with "make test". > "make test" results match the results with baseline. > "Memcpy perf" results match the results with baseline. > > Signed-off-by: Ravi Kerur > Acked-by: Stephen Hemminger > > --- > .../common/include/arch/x86/rte_memcpy.h | 340 +++-- > 1 file changed, 175 insertions(+), 165 deletions(-) > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > index 6a57426..839d4ec 100644 > --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h [...] > /** > @@ -150,13 +150,16 @@ rte_mov64blocks(uint8_t *dst, const uint8_t *src, > size_t n) > __m256i ymm0, ymm1; > > while (n >= 64) { > - ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t > *)src + 0 * 32)); > + > + ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32)); > + ymm1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32)); > + > + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), ymm0); > + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), ymm1); > + Any particular reason to change the order of the statements here? :) Overall this patch looks good. > n -= 64; > - ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t > *)src + 1 * 32)); > - src = (const uint8_t *)src + 64; > - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); > - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); > - dst = (uint8_t *)dst + 64; > + src = src + 64; > + dst = dst + 64; > } > } >
[dpdk-dev] [PATCH v6 08/10] virtio_pci: do not parse if interface is vfio
If virtio interface attached to vfio-noiommu driver then do not parse for virtio resource. Instead exit with return 0; Note: Applicable for virtio spec 0.95. Signed-off-by: Santosh Shukla --- v5-->v6: - Replaced pci_dev->kdrv check from __noiommu to default; This is because patch [1] in v5 series not required. [1] http://dpdk.org/dev/patchwork/patch/9984/ drivers/net/virtio/virtio_pci.c |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio/virtio_pci.c b/drivers/net/virtio/virtio_pci.c index 0c29f1d..deec306 100644 --- a/drivers/net/virtio/virtio_pci.c +++ b/drivers/net/virtio/virtio_pci.c @@ -514,7 +514,9 @@ virtio_resource_init_by_ioports(struct rte_pci_device *pci_dev) static int legacy_virtio_resource_init(struct rte_pci_device *pci_dev) { - if (virtio_resource_init_by_uio(pci_dev) == 0) + if (pci_dev->kdrv == RTE_KDRV_VFIO) + return 0; + else if (virtio_resource_init_by_uio(pci_dev) == 0) return 0; else return virtio_resource_init_by_ioports(pci_dev); -- 1.7.9.5
[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMU driver mode
On Wed, Jan 27, 2016 at 9:26 PM, Santosh Shukla wrote: > On Wed, Jan 27, 2016 at 9:09 PM, Thomas Monjalon > wrote: >> 2016-01-27 21:02, Santosh Shukla: >>> 1. virtio currently works for vfio+noiommu and likely will work for >>> vfio+iommu in near future. >>> 2. So remove __noiommu suffix and always use default. >>> 3. Introduce vfio resource parsing global function, That function >>> suppose to do parsing for default vfio case and for vfio-noiommu case. >>> This function will be used by pmd drivers for resource parsing purpose >>> example virtio. >>> >>> Yuan won't be happy with 3) I guess, because he wanted to get rid of >>> interface parsing from pmd driver. >>> >>> Thomas, if 1/2/3/ addresses your concern then I'll spin the series, >> >> I agree with 1/ and 2/. >> Please, could you explain why 3/ is needed? > > Because someone should do resource parsing / validation before driver > does resource mapping/initialization. That someone could be either EAL > layer or driver itself. > > In my case; > - driver is virtio > - resource is vfio interface FWIW, removed 3) / Removed this patch entirely from this series, Sending v6 version for effected patch [09/11]..
[dpdk-dev] [dpdk-dev, v3] Implement memcmp using Intel SIMD intrinsics.
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h b/lib > /librte_eal/common/include/arch/x86/rte_memcmp.h [...] > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +/** > + * Compare bytes between two locations. The locations must not overlap. > + * Parameter names should be kept consistent as they are in function body. > + * @param src_1 > + * Pointer to the first source of the data. > + * @param src_2 > + * Pointer to the second source of the data. > + * @param n > + * Number of bytes to compare. > + * @return > + * zero if src_1 equal src_2 > + * -ve if src_1 less than src_2 > + * +ve if src_1 greater than src_2 > + */ > +static inline int > +rte_memcmp(const void *src_1, const void *src, > + size_t n) __attribute__((always_inline)); > + > +/** > + * Find the first different bit for comparison. > + */ > +static inline int > +rte_cmpffd (uint32_t x, uint32_t y) > +{ > + int i; > + int pos = x ^ y; > + for (i = 0; i < 32; i++) > + if (pos & (1< + return i; > + return -1; > +} > + [...] > +/** > + * Compare 48 bytes between two locations. > + * Locations should not overlap. > + */ > +static inline int > +rte_cmp48(const void *src_1, const void *src_2) Guess this is not used. [...] > +/** > + * Compare 256 bytes between two locations. > + * Locations should not overlap. > + */ > +static inline int > +rte_cmp256(const void *src_1, const void *src_2) > +{ > + int ret; > + > + ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64, > + (const uint8_t *)src_2 + 0 * 64); Why not just use rte_cmp128? [...] 
> +static inline int > +rte_memcmp(const void *_src_1, const void *_src_2, size_t n) > +{ > + const uint8_t *src_1 = (const uint8_t *)_src_1; > + const uint8_t *src_2 = (const uint8_t *)_src_2; > + int ret = 0; > + > + if (n < 16) > + return rte_memcmp_regular(src_1, src_2, n); > + > + if (n <= 32) { > + ret = rte_cmp16(src_1, src_2); > + if (unlikely(ret != 0)) > + return ret; > + > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n); > + } > + Too many conditions here may harm the overall performance. It's a trade-off thing, all about balancing the overhead. Just make sure this is tuned based on actual test numbers. > + if (n <= 48) { > + ret = rte_cmp32(src_1, src_2); > + if (unlikely(ret != 0)) > + return ret; > + > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n); > + } > + > + if (n <= 64) { > + ret = rte_cmp32(src_1, src_2); > + if (unlikely(ret != 0)) > + return ret; > + > + ret = rte_cmp16(src_1 + 32, src_2 + 32); > + > + if (unlikely(ret != 0)) > + return ret; > + > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n); > + } > + > + if (n <= 96) { > + ret = rte_cmp64(src_1, src_2); > + if (unlikely(ret != 0)) > + return ret; > + > + ret = rte_cmp16(src_1 + 64, src_2 + 64); > + if (unlikely(ret != 0)) > + return ret; > + > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n); > + } > + > + if (n <= 128) { > + ret = rte_cmp64(src_1, src_2); > + if (unlikely(ret != 0)) > + return ret; > + > + ret = rte_cmp32(src_1 + 64, src_2 + 64); > + if (unlikely(ret != 0)) > + return ret; > + > + ret = rte_cmp16(src_1 + 96, src_2 + 96); > + if (unlikely(ret != 0)) > + return ret; > + > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n); > + } [...] > +/** > + * Compare 48 bytes between two locations. > + * Locations should not overlap. > + */ > +static inline int > +rte_cmp48(const void *src_1, const void *src_2) Not used. 
> +{ > + int ret; > + > + ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16, > + (const uint8_t *)src_2 + 0 * 16); > + > + if (unlikely(ret != 0)) > + return ret; > + > + ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16, > + (const uint8_t *)src_2 + 1 * 16); > + > + if (unlikely(ret != 0)) > + return ret; > + > + return rte_cmp16((const uint8_t *)src_1 + 2 * 16, > + (const uint8_t *)src_2 + 2 * 16); > +} > + > +/** > + * Compare 64 bytes between two locations. > + * Locations should not overlap. > + */ > +static inline int > +rte_cmp64(const void *src_1, const void *src_2) > +{ > + int ret; > + > + ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16, > + (const uint8_t *)src_2 + 0 * 16); Why not rte_cmp32? And use rte_cmp64 for rte_cmp128, and so
[dpdk-dev] [PATCH v3] remove extra parentheses in return statement
v3 changes: remove other extra parentheses in 'return (logical expressions)' which checkpatch doesn't report as error remove extra parentheses in return statement which crosses multiple line fix the document v2 changes: add missed commit message in v1 fix the error reported by checkpatch: "ERROR: return is not a function, parentheses are not required" remove parentheses in return like: "return (logical expressions)" remove parentheses in return a function like: "return (rte_mempool_lookup(...))" Fixes: 6307b909b8e0 ("lib: remove extra parenthesis after return") Signed-off-by: Huawei Xie --- app/test-pmd/cmdline.c | 12 ++-- app/test-pmd/config.c | 2 +- app/test-pmd/flowgen.c | 2 +- app/test-pmd/mempool_anon.c| 12 ++-- app/test-pmd/testpmd.h | 2 +- app/test-pmd/txonly.c | 2 +- app/test/test_kni.c| 2 +- app/test/test_mbuf.c | 12 ++-- app/test/test_memcpy_perf.c| 4 +- app/test/test_mempool.c| 4 +- app/test/test_memzone.c| 24 +++ app/test/test_red.c| 42 ++-- app/test/test_ring.c | 4 +- doc/guides/sample_app_ug/ipv4_multicast.rst| 8 +-- drivers/crypto/aesni_mb/rte_aesni_mb_pmd_ops.c | 2 +- drivers/crypto/qat/qat_crypto.c| 4 +- drivers/crypto/qat/qat_qp.c| 22 +++--- drivers/net/bnx2x/bnx2x.c | 34 - drivers/net/bnx2x/bnx2x.h | 4 +- drivers/net/bnx2x/bnx2x_rxtx.c | 16 ++--- drivers/net/bnx2x/debug.c | 6 +- drivers/net/bnx2x/elink.c | 2 +- drivers/net/bonding/rte_eth_bond_pmd.c | 2 +- drivers/net/cxgbe/cxgbe_main.c | 2 +- drivers/net/e1000/em_ethdev.c | 40 +-- drivers/net/e1000/em_rxtx.c| 46 ++--- drivers/net/e1000/igb_ethdev.c | 22 +++--- drivers/net/e1000/igb_rxtx.c | 30 drivers/net/enic/enic_clsf.c | 2 +- drivers/net/fm10k/fm10k_ethdev.c | 40 +-- drivers/net/i40e/i40e_ethdev.c | 2 +- drivers/net/i40e/i40e_ethdev.h | 2 +- drivers/net/i40e/i40e_ethdev_vf.c | 2 +- drivers/net/i40e/i40e_rxtx.c | 14 ++-- drivers/net/ixgbe/ixgbe_82599_bypass.c | 4 +- drivers/net/ixgbe/ixgbe_bypass.c | 2 +- drivers/net/ixgbe/ixgbe_ethdev.c | 34 - drivers/net/ixgbe/ixgbe_rxtx.c | 36 +- 
drivers/net/mlx5/mlx5_rxq.c| 2 +- drivers/net/mlx5/mlx5_utils.h | 2 +- drivers/net/mpipe/mpipe_tilegx.c | 4 +- drivers/net/nfp/nfp_net.c | 16 ++--- drivers/net/virtio/virtio_ethdev.c | 6 +- drivers/net/vmxnet3/vmxnet3_ring.h | 2 +- drivers/net/xenvirt/virtqueue.h| 2 +- examples/ip_pipeline/cpu_core_map.c| 2 +- .../pipeline/pipeline_flow_actions_be.c| 2 +- examples/ip_reassembly/main.c | 22 +++--- examples/ipv4_multicast/main.c | 14 ++-- examples/l3fwd/main.c | 4 +- .../client_server_mp/mp_server/init.c | 2 +- examples/multi_process/symmetric_mp/main.c | 2 +- examples/netmap_compat/bridge/bridge.c | 8 +-- examples/netmap_compat/lib/compat_netmap.c | 80 +++--- examples/performance-thread/common/lthread_queue.h | 2 +- examples/performance-thread/common/lthread_sched.c | 4 +- examples/qos_sched/args.c | 2 +- examples/quota_watermark/qw/main.h | 2 +- examples/vhost/main.c | 4 +- examples/vhost_xen/main.c | 4 +- examples/vhost_xen/vhost_monitor.c | 6 +- lib/librte_acl/acl_bld.c | 4 +- lib/librte_acl/acl_run_neon.h | 2 +- lib/librte_cfgfile/rte_cfgfile.c | 4 +- lib/librte_cryptodev/rte_cryptodev.c | 24 +++ lib/librte_eal/bsdapp/eal/eal_lcore.c | 2 +- lib/librte_eal/common/eal_common_memzone.c | 2 +- .../common/include/arch/ppc_64/rte_atomic.h| 12 ++-- .../common/include/arch/ppc_64/rte_byteorder.h | 10 +-- .../common/include/arch/ppc_64/rte_spinlock.h | 2 +- .../common/include/arch/x86/rte_atomic.h |
[dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# Inter-VM communication & IP allocation through DHCP issue
Hi Przemek, Thanks for the quick response. Now able to get the DHCP ip's for 2 vhostuser instances and able to ping each other. Isssue was a bug in cirros 0.3.0 images which we were using in openstack after using 0.3.1 image as given in the URL( https://www.redhat.com/archives/rhos-list/2013-August/msg00032.html), able to get the IP's in vhostuser VM instances. As per our understanding, Packet flow across DPDK datapath will be like vhostuser ports are connected to the br-int bridge & same is being patched to the br-dpdk bridge where in our physical network (NIC) is connected with dpdk0 port. So for testing the flow we have to connect that physical network(NIC) with external packet generator (e.g - ixia, iperf) & run the testpmd application in the vhostuser VM, right? Does it required to add any flows/efforts in bridge configurations(either br-int or br-dpdk)? Thanks & Regards Abhijeet Karve From: "Czesnowicz, Przemyslaw"To: Abhijeet Karve Cc: "dev at dpdk.org" , "discuss at openvswitch.org" , "Gray, Mark D" Date: 01/27/2016 05:11 PM Subject:RE: [dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# Inter-VM communication & IP allocation through DHCP issue Hi Abhijeet, It seems you are almost there! When booting the VM?s do you request hugepage memory for them (by setting hw:mem_page_size=large in flavor extra_spec)? If not then please do, if yes then please look into libvirt logfiles for the VM?s (in /var/log/libvirt/qemu/instance-xxx), I think there could be a clue. Regards Przemek From: Abhijeet Karve [mailto:abhijeet.ka...@tcs.com] Sent: Monday, January 25, 2016 6:13 PM To: Czesnowicz, Przemyslaw Cc: dev at dpdk.org; discuss at openvswitch.org; Gray, Mark D Subject: RE: [dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# Inter-VM communication & IP allocation through DHCP issue Hi Przemek, Thank you for your response, It really provided us breakthrough. 
After setting up DPDK on compute node for stable/kilo, We are trying to set up Openstack stable/liberty all-in-one setup, At present we are not able to get the IP allocation for the vhost type instances through DHCP. Also we tried assigning IP's manually to them but the inter-VM communication also not happening, #neutron agent-list root at nfv-dpdk-devstack:/etc/neutron# neutron agent-list +--++---+---++---+ | id | agent_type | host | alive | admin_state_up | binary| +--++---+---++---+ | 3b29e93c-3a25-4f7d-bf6c-6bb309db5ec0 | DPDK OVS Agent | nfv-dpdk-devstack | :-) | True | neutron-openvswitch-agent | | 62593b2c-c10f-4d93-8551-c46ce24895a6 | L3 agent | nfv-dpdk-devstack | :-) | True | neutron-l3-agent | | 7cb97af9-cc20-41f8-90fb-aba97d39dfbd | DHCP agent | nfv-dpdk-devstack | :-) | True | neutron-dhcp-agent| | b613c654-99b7-437e-9317-20fa651a1310 | Linux bridge agent | nfv-dpdk-devstack | :-) | True | neutron-linuxbridge-agent | | c2dd0384-6517-4b44-9c25-0d2825d23f57 | Metadata agent | nfv-dpdk-devstack | :-) | True | neutron-metadata-agent| | f23dde40-7dc0-4f20-8b3e-eb90ddb15e49 | Open vSwitch agent | nfv-dpdk-devstack | xxx | True | neutron-openvswitch-agent | +--++---+---++---+ ovs-vsctl show output# Bridge br-dpdk Port br-dpdk Interface br-dpdk type: internal Port phy-br-dpdk Interface phy-br-dpdk type: patch options: {peer=int-br-dpdk} Bridge br-int fail_mode: secure Port "vhufa41e799-f2" tag: 5 Interface "vhufa41e799-f2" type: dpdkvhostuser Port int-br-dpdk Interface int-br-dpdk type: patch options: {peer=phy-br-dpdk} Port "tap4e19f8e1-59" tag: 5 Interface "tap4e19f8e1-59" type: internal Port "vhu05734c49-3b" tag: 5 Interface "vhu05734c49-3b" type: dpdkvhostuser Port "vhu10c06b4d-84" tag: 5 Interface "vhu10c06b4d-84" type: dpdkvhostuser Port patch-tun Interface patch-tun type: patch options: {peer=patch-int} Port "vhue169c581-ef" tag: 5 Interface "vhue169c581-ef"
[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMU driver mode
On Wed, Jan 27, 2016 at 9:09 PM, Thomas Monjalon wrote: > 2016-01-27 21:02, Santosh Shukla: >> 1. virtio currently works for vfio+noiommu and likely will work for >> vfio+iommu in near future. >> 2. So remove __noiommu suffix and always use default. >> 3. Introduce vfio resource parsing global function, That function >> suppose to do parsing for default vfio case and for vfio-noiommu case. >> This function will be used by pmd drivers for resource parsing purpose >> example virtio. >> >> Yuan won't be happy with 3) I guess, because he wanted to get rid of >> interface parsing from pmd driver. >> >> Thomas, if 1/2/3/ addresses your concern then I'll spin the series, > > I agree with 1/ and 2/. > Please, could you explain why 3/ is needed? Because someone should do resource parsing / validation before driver does resource mapping/initialization. That someone could be either EAL layer or driver itself. In my case; - driver is virtio - resource is vfio interface
[dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms
2016-01-27 18:48, Ananyev, Konstantin: > From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com] > > > > > Zhihong Wang (5): > > > lib/librte_eal: Identify AVX512 CPU flag > > > mk: Predefine AVX512 macro for compiler > > > lib/librte_eal: Optimize memcpy for AVX512 platforms > > > app/test: Adjust alignment unit for memcpy perf test > > > lib/librte_eal: Tune memcpy for prior platforms > > > > > > app/test/test_memcpy_perf.c| 6 + > > > .../common/include/arch/x86/rte_memcpy.h | 269 > > > - > > > mk/rte.cpuflags.mk | 4 + > > > 4 files changed, 268 insertions(+), 13 deletions(-) > > > > The maintainers of arch/x86 are Bruce and Konstantin. > > I guess there is no comment and we can apply this cool series? > > Yes, looks ok to me. Applied, thanks. Some benchmark feedback would be welcome.
[dpdk-dev] [PATCH v2] doc: introduce networking driver matrix
In order to better compare the drivers and check what is missing for a common baseline, we need to fill a matrix. A CSS trick is used to fit the HTML page. The PDF output needs some LaTeX wizardry. Signed-off-by: Thomas Monjalon --- v2: add vector PMDs --- doc/guides/nics/index.rst| 1 + doc/guides/nics/overview.rst | 147 +++ 2 files changed, 148 insertions(+) create mode 100644 doc/guides/nics/overview.rst diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst index 33c9cea..8618114 100644 --- a/doc/guides/nics/index.rst +++ b/doc/guides/nics/index.rst @@ -35,6 +35,7 @@ Network Interface Controller Drivers :maxdepth: 3 :numbered: +overview bnx2x cxgbe e1000em diff --git a/doc/guides/nics/overview.rst b/doc/guides/nics/overview.rst new file mode 100644 index 000..d4c6ff4 --- /dev/null +++ b/doc/guides/nics/overview.rst @@ -0,0 +1,147 @@ +.. BSD LICENSE +Copyright 2016 6WIND S.A. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +* Neither the name of 6WIND S.A. nor the names of its +contributors may be used to endorse or promote products derived +from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Overview of Networking Drivers +== + +The networking drivers may be classified in two categories: + +- physical for real devices +- virtual for emulated devices + +Some physical devices may be shaped through a virtual layer as for +SR-IOV. +The interface seen in the virtual environment is a VF (Virtual Function). + +The ethdev layer exposes an API to use the networking functions +of these devices. +The bottom half part of ethdev is implemented by the drivers. +Thus some features may not be implemented. + +There are more differences between drivers regarding some internal properties, +portability or even documentation availability. +Most of these differences are summarized below. + +.. _table_net_pmd_features: + +.. raw:: html + + + table#id1 th { + font-size: 80%; + white-space: pre-wrap; + text-align: center; + vertical-align: top; + padding: 3px; + } + table#id1 th:first-child { + vertical-align: bottom; + } + table#id1 td { + font-size: 70%; + padding: 1px; + } + table#id1 td:first-child { + padding-left: 1em; + } + + +.. table:: Features availability in networking drivers + + = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = + Feature a b b b c e e i i i i i i i i i i f f m m m n n p r s v v v x +f n n o x 1 n 4 4 4 4 g g x x x x m m l l p f u c i z i i m e +p x x n g 0 i 0 0 0 0 b b g g g g 1 1 x x i p l a n e r r x n +a 2 2 d b 0 c e e e e v b b b b 0 0 4 5 p l p g d t t n v +c x x i e 0 . v v f e e e e k k e a i i e i +k v n . f f . 
v v . t o o t r +e f g . . . f f . a . 3 t +t v v v v v 2 v +e e e e e e +c c c c c c + = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = + link status + link status event + Rx interrupt + queue start/stop + MTU update + jumbo frame + scattered Rx + LRO + TSO + promiscuous mode + allmulticast mode
[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMU driver mode
On Wed, Jan 27, 2016 at 4:11 PM, Santosh Shukla wrote: > On Tue, Jan 26, 2016 at 9:51 PM, Santosh Shukla wrote: >> On Tue, Jan 26, 2016 at 7:58 PM, Thomas Monjalon >> wrote: >>> 2016-01-26 19:35, Santosh Shukla: On Tue, Jan 26, 2016 at 6:30 PM, Thomas Monjalon wrote: > 2016-01-26 15:56, Santosh Shukla: >> In my observation, currently virtio work for vfio-noiommu, that's why >> said drv->kdrv need to know vfio mode. > > It is your observation. It may change in near future. so that mean till then, virtio support for non-x86 arch has to wait? >>> >>> No, absolutely not. virtio for non-x86 is welcome. >>> We have working model with vfio-noiommu, don't you think it make sense to let vfio_noiommu implementation exist and later in-case virtio+iommu gets mainline then switch to vfio __mode__ agnostic approach. And for that All it takes to replace __noiommu suffix with default. >>> >>> I'm just saying you should not touch the enum rte_kernel_driver. >>> RTE_KDRV_VFIO is a driver. >>> RTE_KDRV_VFIO_NOIOMMU is a mode. >>> As the VFIO API is the same in both modes, there is no reason to >>> distinguish them at this level. >>> Your patch adds the NOIOMMU case everywhere: >>> case RTE_KDRV_VFIO: >>> + case RTE_KDRV_VFIO_NOIOMMU: >>> >>> I'll stop commenting here to let others give their opinion. >>> >>> [...] >> with vfio+iommu; binding virtio pci device to vfio-pci driver fail; >> giving below error: >> [ 53.053464] VFIO - User Level meta-driver version: 0.3 >> [ 73.077805] vfio-pci: probe of :00:03.0 failed with error -22 >> [ 73.077852] vfio-pci: probe of :00:03.0 failed with error -22 >> >> vfio_pci_probe() --> vfio_iommu_group_get() --> iommu_group_get() >> fails: iommu doesn't have group for virtio pci device. > > Yes it fails when binding. > So the later check in the virtio PMD is useless. Which check? >>> >>> The check for VFIO noiommu only: >>> - if (dev->kdrv == RTE_KDRV_VFIO) >>> + if (dev->kdrv == RTE_KDRV_VFIO_NOIOMMU) >>> >>> [...] 
> Furthermore restricting virtio to no-iommu mode doesn't bring > any improvement. We're not __restricting__, as soon as virtio+iommu gets working state, we'll simply replace __noiommu with default. Then its upto user to try out virtio with vfio default or vfio_noiommu. >>> >>> Yes it's up to user. >>> So your code should be >>> if (dev->kdrv == RTE_KDRV_VFIO) >>> >> >> Right, >> > That's why I suggest to keep the initial semantic of kdrv and > not pollute it with VFIO modes. I am okay to live with default and forget suffix __noiommu but there are implementation problem which was discussed in other thread - Virtio pmd driver should avoid interface parsing i.e. virtio_resource_init_uio/vfio() etc.. For vfio case - We could easily get rid of by moving /sys parsing to pci_eal layer, Right? If so then virtio currently works with vfio-noiommu, it make sense to me that pci_eal layer does parsing for pmd driver before that pmd driver get initialized. >>> >>> Please reword. What is the problem? >>> - Another case could be: iommu-less-pmd-driver. eal layer to do parsing before updating drv->kdrv. >>> >>> [...] >> >> > If a check is needed, I would prefer using your function >> >> > pci_vfio_is_noiommu() and remove driver modes from struct >> >> > rte_kernel_driver. >> >> >> >> I don't think calling pci_vfio_no_iommu() inside >> >> virtio_reg_rd/wr_1/2/3() would be a good idea. >> > >> > Why? The value may be cached in the priv properties. >> > >> pci_vfio_is_noiommu() parses /sys for >> - enable_noiommu param >> - attached driver name is vfio-noiommu or not. >> >> It does file operation for that, I meant to say that calling this api >> within register_rd/wr function is not correct. It would be better if >> those low level register_rd/wr api only checks driver_types. > > Yes, that's why I said the return of pci_vfio_is_noiommu() may be cached > to keep efficiency. 
I am not convinced though, Still find pmd driver checking driver_types using drv->kdrv is better approach than introducing a new global variable which may look something like; >>> >>> Not a global variable. A function in EAL layer. A variable in PMD priv. >>> >> >> If we agreed to use condition (drv->kdrv == RTE_KDRV_VFIO); >> then resource parsing for vfio {including vfio and vfio_noiommu both >> case} is enforced in virtio pmd driver layer and that is contradicting >> to what we agreed earlier in this[1] thread. Also we don't need a >> function in EAL layer or a variable in PMD priv. Perhaps a private >> function in virtio pmd which does parsing for vfio interface. >> >> Thoughts? >> >> [1] http://dpdk.org/dev/patchwork/patch/9862/ >>
[dpdk-dev] [PATCH] tools: fix syntax errors and add support for Python 3
This patch fixes syntax errors from tools/setup.sh during binding ethernet device on systems where Python 3 is default. Backward compability with Python 2 is preserved. Signed-off-by: Dawid Jurczak --- tools/dpdk_nic_bind.py | 74 +- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/tools/dpdk_nic_bind.py b/tools/dpdk_nic_bind.py index f02454e..9f7c848 100755 --- a/tools/dpdk_nic_bind.py +++ b/tools/dpdk_nic_bind.py @@ -54,7 +54,7 @@ args = [] def usage(): '''Print usage information for the program''' argv0 = basename(sys.argv[0]) -print """ +print (""" Usage: -- @@ -110,7 +110,7 @@ To unbind :01:00.0 from using any driver To bind :02:00.0 and :02:00.1 to the ixgbe kernel driver %(argv0)s -b ixgbe 02:00.0 02:00.1 -""" % locals() # replace items from local variables +""" % locals()) # replace items from local variables # This is roughly compatible with check_output function in subprocess module # which is only available in python 2.7. @@ -156,7 +156,7 @@ def check_modules(): '''Checks that igb_uio is loaded''' global dpdk_drivers -fd = file("/proc/modules") +fd = open("/proc/modules", 'r') loaded_mods = fd.readlines() fd.close() @@ -176,10 +176,10 @@ def check_modules(): # check if we have at least one loaded module if True not in [mod["Found"] for mod in mods] and b_flag is not None: if b_flag in dpdk_drivers: -print "Error - no supported modules(DPDK driver) are loaded" +print ("Error - no supported modules(DPDK driver) are loaded") sys.exit(1) else: -print "Warning - no supported modules(DPDK driver) are loaded" +print ("Warning - no supported modules(DPDK driver) are loaded") # change DPDK driver list to only contain drivers that are loaded dpdk_drivers = [mod["Name"] for mod in mods if mod["Found"]] @@ -198,7 +198,7 @@ def get_pci_device_details(dev_id): for line in extra_info: if len(line) == 0: continue -name, value = line.split("\t", 1) +name, value = line.decode().split("\t", 1) name = name.strip(":") + "_str" device[name] = value # check for a 
unix interface name @@ -234,7 +234,7 @@ def get_nic_details(): dev["Device"] = int(dev["Device"],16) devices[dev["Slot"]] = dict(dev) # use dict to make copy of dev else: -name, value = dev_line.split("\t", 1) +name, value = dev_line.decode().split("\t", 1) dev[name.rstrip(":")] = value # check what is the interface if any for an ssh connection if @@ -243,17 +243,17 @@ def get_nic_details(): route = check_output(["ip", "-o", "route"]) # filter out all lines for 169.254 routes route = "\n".join(filter(lambda ln: not ln.startswith("169.254"), - route.splitlines())) + route.decode().splitlines())) rt_info = route.split() -for i in xrange(len(rt_info) - 1): +for i in range(len(rt_info) - 1): if rt_info[i] == "dev": ssh_if.append(rt_info[i+1]) # based on the basic info, get extended text details for d in devices.keys(): # get additional info and add it to existing data -devices[d] = dict(devices[d].items() + - get_pci_device_details(d).items()) +devices[d] = devices[d].copy() +devices[d].update(get_pci_device_details(d).items()) for _if in ssh_if: if _if in devices[d]["Interface"].split(","): @@ -293,22 +293,22 @@ def dev_id_from_dev_name(dev_name): if dev_name in devices[d]["Interface"].split(","): return devices[d]["Slot"] # if nothing else matches - error -print "Unknown device: %s. " \ -"Please specify device in \"bus:slot.func\" format" % dev_name +print ("Unknown device: %s. 
" \ +"Please specify device in \"bus:slot.func\" format" % dev_name) sys.exit(1) def unbind_one(dev_id, force): '''Unbind the device identified by "dev_id" from its current driver''' dev = devices[dev_id] if not has_driver(dev_id): -print "%s %s %s is not currently managed by any driver\n" % \ -(dev["Slot"], dev["Device_str"], dev["Interface"]) +print ("%s %s %s is not currently managed by any driver\n" % \ +(dev["Slot"], dev["Device_str"], dev["Interface"])) return # prevent us disconnecting ourselves if dev["Ssh_if"] and not force: -print "Routing table indicates that interface %s is active" \ -". Skipping unbind" % (dev_id) +print ("Routing table indicates that interface %s is active" \ +". Skipping unbind" % (dev_id)) return # write to /sys to unbind @@ -316,7 +316,7 @@ def unbind_one(dev_id,
[dpdk-dev] DPDK mbuf pool in SR-IOV env and one RX/TX queue
Any clues or hint on how to debug this kind of problem on SR-IOV? Only primary can send packet but secondary process couldn't. I have verified host's qprc and qptc counters on PF and they do increment. SR-IOV with DPDK seems more challenging than PCI pass through of whole NIC. Saurabh On Jan 26, 2016 12:19 PM, "Bruce Richardson" wrote: > On Mon, Jan 25, 2016 at 04:15:28PM -0800, Saurabh Mishra wrote: > > Hi Bruce -- > > > > >The sharing of the mbuf pool is not an issue, but sharing of rx/tx > queues > > is. > > >The ethdev queues are not multi-thread safe, so to share a queue between > > processes > > >or threads, you need to put in locks or other access control mechanisms. > > [This > > >also implies a performance hit due to the locking] > > >Regards, > > >/Bruce > > > > Right. So now we have only one process to do rx/tx on queue 0 if we > detect > > that max queue support is 1. > > > > However, we have noticed that if our process, which does rx/tx, is not > > primary, then we can't transmit the packet out with SR-IOV. > > > > Is there any specific limitation on SR-IOV (the vf driver in dpdk) that > > only primary process should receive and transmit packets? > > > > In our model, we have an agent process which monitor links and another > > process which does packet processing. If we make our agent process as > > primary then our secondary process is not able to send the packets -- > > rte_eth_tx_burst() succeed but recipient does not receive the packet. > > > > Thanks, > > /Saurabh > > There should be no restrictions on RX/TX from secondary processes. > > /Bruce >
[dpdk-dev] [RFC] eal: add cgroup-aware resource self discovery
Hi Neil, On 1/26/2016 10:19 PM, Neil Horman wrote: > On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote: >> Hi Neil, >> >> On 1/25/2016 9:46 PM, Neil Horman wrote: >>> On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote: >> ... -- 2.1.4 >>> This doesn't make a whole lot of sense, for several reasons: >>> >>> 1) Applications, as a general rule shouldn't be interrogating the cgroups >>> interface at all. >> The main reason to do this in DPDK is that DPDK obtains resource information >> from sysfs and proc, which are not well containerized so far. And DPDK >> pre-allocates resource instead of on-demand gradual allocating. >> > Not disagreeing with this, just suggesting that: > > 1) Interrogating cgroups really isn't the best way to collect that information > 2) Pre-allocating those resources isn't particularly wise without some > mechanism > to reallocate it, as resource constraints can change (consider your cpuset > getting rewritten) In the case of reallocate, For cpuset, DPDK panics in the initialization if set_affinity fails, but after that, cpuset rewritten will not bring any problem I believe. For memory, a running application uses 2G hugepages, then admin decreases hugetlb cgroup into 1G, the application will not get killed, unless it tries to access more hugepages (I'll double check this). So another way to address this problem is to add an option that DPDK tries best to allocate those resources, and if fails, it just posts a warning and uses those allocated resources, instead of panic. What do you think? > >>> 2) Cgroups aren't the only way in which a cpuset or memoryset can be >>> restricted >>> (the isolcpus command line argument, or a taskset on a parent process for >>> instance, but there are several others). >> Yes, I agree. To enable that, I'd like design the new API for resource self >> discovery in a flexible way. A parameter "type" is used to specify the >> solution to discovery way. 
In addition, I'm considering to add a callback >> function pointer so that users can write their own resource discovery >> functions. >> > Why? You don't need an API for this, or if you really want one, it can be > very > generic if you use POSIX apis to gather the information. What you have here > is > going to be very linux specific, and will need reimplementing for BSD or other > operating systems. To use the cpuset example, instead of reading and parsing > the mask files in the cgroup filesystem module to find your task and > corresponding mask, just call sched_setaffinity with an all f's mask, then > call > sched_getaffinity. The returned mask will be all the cpus your process is > allowed to execute on, taking into account every limiting filter the system > you > are running on offers. Yes, it makes sense on cpu's side. > > There are simmilar OS level POSIX apis for most resources out there. You > really > don't need to dig through cgroups just to learn what some of those reources > are. > >>> Instead of trying to figure out what cpuset is valid for your process by >>> interrogating the cgroups heirarchy, instead you should follow the >>> proscribed >>> method of calling sched_getaffinity after calling sched_setaffinity. That >>> will >>> give you the canonical cpuset that you are executing on, taking all cpuset >>> filters into account (including cgroups and any other restrictions). Its >>> far >>> simpler as well, as it doesn't require a ton of file/string processing. >> Yes, this way is much better for cpuset discovery. But is there such a >> syscall for hugepages? >> > In what capacity? Interrogating how many hugepages you have, or to what node > they are affined to? Capacity would require reading the requisite proc file, > as > theres no posix api for this resource. 
Node affinity can be implied by > setting > the numa policy of the dpdk and then writing to /proc/nr_hugepages, as the > kernel will attempt to distribute hugepages evenly among the tasks' numa > policy > configuration. For memory affinity, I believe the existing way of reading /proc/self/pagemap already handle the problem. What I was asking is how much memory (or hugepages in Linux's case) can be used. By the way, what is /proc/nr_hugepages? > > That said, I would advise that you strongly consider not exporting hugepages > as > a resource, as: > > a) Applications generally don't need to know that they are using hugepages, > and > so they dont need to know where said hugepages live, they just allocate memory > via your allocation api and you give them something appropriate But the allocation api provider, DPDK library, needs to know if it's using hugepages or not. > b) Hugepages are a resource that are very specific to Linux, and to X86 Linux > at > that. Some OS implement simmilar resources, but they may have very different > semantics. And other Arches may or may not implement various forms of > compound > paging at all. As the DPDK expands to
[dpdk-dev] [PATCH v2] fix checkpatch errors
2016-01-27 01:26, Huawei Xie: > v2 changes: > add missed commit message in v1 > > fix the error reported by checkpatch: > "ERROR: return is not a function, parentheses are not required" > > also removed other extra parentheses like: > "return val == 0" > "return (rte_mempool_lookup(...))" How are these examples different from the above checkpatch error? Please add Fixes: 6307b909b8e0 ("lib: remove extra parenthesis after return") This is the second run after the above commit, but I still see a lot of them. Please check git grep 'return *('
[dpdk-dev] [PATCH] vfio/noiommu: Don't use iommu_present() to track fake groups
On Wed, Jan 27, 2016 at 6:51 PM, Burakov, Anatoly wrote: > Hi Alex, > >> On 01/23/2016 04:23 AM, Alex Williamson wrote: >> > Using iommu_present() to determine whether an IOMMU group is real or >> > fake has some problems. First, apparently Power systems don't >> > register an IOMMU on the device bus, so the groups and containers get >> > marked as noiommu and then won't bind to their actual IOMMU driver. >> > Second, I expect we'll run into the same issue as we try to support >> > vGPUs through vfio, since they're likely to emulate this behavior of >> > creating an IOMMU group on a virtual device and then providing a vfio >> > IOMMU backend tailored to the sort of isolation they provide, which >> > won't necessarily be fully compatible with the IOMMU API. >> > >> > The solution here is to use the existing iommudata interface to IOMMU >> > groups, which allows us to easily identify the fake groups we've >> > created for noiommu purposes. The iommudata we set is purely >> > arbitrary since we're only comparing the address, so we use the >> > address of the noiommu switch itself. >> > >> > Reported-by: Alexey Kardashevskiy >> > Fixes: 03a76b60f8ba ("vfio: Include No-IOMMU mode") >> > Signed-off-by: Alex Williamson >> >> >> >> Reviewed-by: Alexey Kardashevskiy >> Tested-by: Alexey Kardashevskiy > > Tested bringing the NIC's up, encountered no issues. Curious if it also works > for Santosh (CC'd) as he's one of the intended users of the No-IOMMU > functionality, but otherwise seems to work. > Yes, it works for the virtio DPDK case too. Tested-by: Thanks. > Thanks, > Anatoly
[dpdk-dev] [PATCH] eal: add architecture specific rte_cpuflags.c files
2015-11-10 10:02, Ferruh Yigit: > --- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map > +++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map > @@ -133,5 +133,6 @@ DPDK_2.2 { > global: > > rte_intr_cap_multiple; > + cpu_feature_table; As it is now an exported symbol, it should be prefixed with rte_. Please take care when rebasing to - create a new block of symbols for the new release version, - and keep the new flag for AVX512 which is going to be applied.
[dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms
Hi Thomas, > -Original Message- > From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com] > Sent: Wednesday, January 27, 2016 3:31 PM > To: Richardson, Bruce; Ananyev, Konstantin > Cc: dev at dpdk.org; Wang, Zhihong > Subject: Re: [dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms > > > Zhihong Wang (5): > > lib/librte_eal: Identify AVX512 CPU flag > > mk: Predefine AVX512 macro for compiler > > lib/librte_eal: Optimize memcpy for AVX512 platforms > > app/test: Adjust alignment unit for memcpy perf test > > lib/librte_eal: Tune memcpy for prior platforms > > > > app/test/test_memcpy_perf.c| 6 + > > .../common/include/arch/x86/rte_cpuflags.h | 2 + > > .../common/include/arch/x86/rte_memcpy.h | 269 > > - > > mk/rte.cpuflags.mk | 4 + > > 4 files changed, 268 insertions(+), 13 deletions(-) > > The maintainers of arch/x86 are Bruce and Konstantin. > I guess there is no comment and we can apply this cool series? Yes, looks ok to me. Konstantin
[dpdk-dev] DPDK bnx2x driver link problem
Looks like bnx2x has a link problem: sometimes it sees link up, and most of the time it sees link down even though the RX/TX counters are going up. Has anybody seen this type of problem? If I don't use DPDK then I don't see this type of link related problem. The counters show that it's receiving and transmitting packets. I tried with -n1, -n2 and -n4. *1st Time:* [root@ ~]# ./symmetric_mp fakeelf -c 2 -m2048 -n4 --proc-type=primary -- -p 3 --num-procs=2 --proc-id=0 PMD: bnx2x_print_adapter_info(): Switch : 0 PMD: bnx2x_print_adapter_info(): === Checking link status..done Port 0 Link Up - speed 1 Mbps - full-duplex Port 1 Link Down APP: Finished Process Init. Lcore 1 using ports 0 1 lcore 1 using queue 0 of each port ^C Exiting on signal 2 Port 0: RX - 3, TX - 27, Drop - 0 Port 1: RX - 27, TX - 3, Drop - 0 *Second time:* [root@ ~]# ./symmetric_mp fakeelf -c 2 -m2048 -n4 --proc-type=primary -- -p 3 --num-procs=2 --proc-id=0 PMD: bnx2x_print_adapter_info(): Switch : 0 PMD: bnx2x_print_adapter_info(): === Checking link status..done Port 0 Link Down Port 1 Link Down APP: Finished Process Init. Lcore 1 using ports 0 1 lcore 1 using queue 0 of each port ^C Exiting on signal 2 Port 0: RX - 3, TX - 17, Drop - 0 Port 1: RX - 17, TX - 3, Drop - 0 [root@ ~]# *Third time:* [root@ ~]# ./symmetric_mp fakeelf -c 2 -m2048 -n4 --proc-type=primary -- -p 3 --num-procs=2 --proc-id=0 Checking link status..done Port 0 Link Down Port 1 Link Down APP: Finished Process Init. Lcore 1 using ports 0 1 lcore 1 using queue 0 of each port ^C Exiting on signal 2 Port 0: RX - 8, TX - 84, Drop - 0 Port 1: RX - 84, TX - 8, Drop - 0 [root@~]# *4th time:* Checking link status..done Port 0 Link Down Port 1 Link Down APP: Finished Process Init. Lcore 1 using ports 0 1 lcore 1 using queue 0 of each port ^C Exiting on signal 2 Port 0: RX - 2, TX - 14, Drop - 0 Port 1: RX - 14, TX - 2, Drop - 0 [root@ ~]#
[dpdk-dev] [PATCH 4/4] examples/ip_pipeline: add packets dumping to PCAP file support
This patch add packet dumping feature to ip_pipeline. Output port type SINK now supports dumping packets to PCAP file before releasing mbuf back to mempool. This feature can be applied by specifying parameters in configuration file as shown below: [PIPELINE1] type = PASS-THROUGH core = 1 pktq_in = SOURCE0 SOURCE1 pktq_out = SINK0 SINK1 pcap_file_wr = /path/to/eth1.pcap /path/to/eth2.pcap pcap_n_pkt_wr = 80 0 The configuration section "pcap_file_wr" contains full path and name of the PCAP file which the packets will be dumped to. If multiple SINKs exists, each shall have its own PCAP file path listed in this section, separated by spaces. Multiple SINK ports shall NOT share same PCAP file to be dumped. The configuration section "pcap_n_pkt_wr" contains integer value(s) and indicates the maximum number of packets to be dumped to the PCAP file. If this value is "0", the "infinite" dumping mode will be used. If this value is N (N > 0), the dumping will be finished when the number of packets dumped to the file reaches N. To enable PCAP dumping support to IP pipeline, the compiler option CONFIG_RTE_PORT_PCAP must be set to 'y'. It is possible to disable this feature by removing "pcap_file_wr" and "pcap_n_pkt_wr" lines from the configuration file. 
Signed-off-by: Fan Zhang Acked-by: Cristian Dumitrescu --- examples/ip_pipeline/app.h | 2 + examples/ip_pipeline/config_parse.c | 159 examples/ip_pipeline/init.c | 11 +++ examples/ip_pipeline/pipeline_be.h | 2 + 4 files changed, 174 insertions(+) diff --git a/examples/ip_pipeline/app.h b/examples/ip_pipeline/app.h index 9dbe668..144fab8 100644 --- a/examples/ip_pipeline/app.h +++ b/examples/ip_pipeline/app.h @@ -155,6 +155,8 @@ struct app_pktq_source_params { struct app_pktq_sink_params { char *name; uint8_t parsed; + char *file_name; /* Full path of PCAP file to be copied to mbufs */ + uint32_t n_pkts_to_dump; }; struct app_msgq_params { diff --git a/examples/ip_pipeline/config_parse.c b/examples/ip_pipeline/config_parse.c index f0bed81..9f5b974 100644 --- a/examples/ip_pipeline/config_parse.c +++ b/examples/ip_pipeline/config_parse.c @@ -184,6 +184,8 @@ struct app_pktq_source_params default_source_params = { struct app_pktq_sink_params default_sink_params = { .parsed = 0, + .file_name = NULL, + .n_pkts_to_dump = 0, }; struct app_msgq_params default_msgq_params = { @@ -1003,6 +1005,83 @@ parse_pipeline_pcap_source(struct app_params *app, } static int +parse_pipeline_pcap_sink(struct app_params *app, + struct app_pipeline_params *p, + const char *file_name, const char *n_pkts_to_dump) +{ + const char *next = NULL; + char *end; + uint32_t i; + int parse_file = 0; + + if (file_name && !n_pkts_to_dump) { + next = file_name; + parse_file = 1; /* parse file path */ + } else if (n_pkts_to_dump && !file_name) { + next = n_pkts_to_dump; + parse_file = 0; /* parse copy size */ + } else + return -EINVAL; + + char name[APP_PARAM_NAME_SIZE]; + size_t name_len; + + if (p->n_pktq_out == 0) + return -EINVAL; + + for (i = 0; i < p->n_pktq_out; i++) { + if (p->pktq_out[i].type != APP_PKTQ_OUT_SINK) + return -EINVAL; + } + + i = 0; + while (*next != '\0') { + uint32_t id; + + if (i >= p->n_pktq_out) + return -EINVAL; + + id = p->pktq_out[i].id; + + end = strchr(next, ' '); + if 
(!end) + name_len = strlen(next); + else + name_len = end - next; + + if (name_len == 0 || name_len == sizeof(name)) + return -EINVAL; + + strncpy(name, next, name_len); + name[name_len] = '\0'; + next += name_len; + if (*next != '\0') + next++; + + if (parse_file) { + app->sink_params[id].file_name = strdup(name); + if (app->sink_params[id].file_name == NULL) + return -ENOMEM; + } else { + if (parser_read_uint32( + >sink_params[id].n_pkts_to_dump, + name) != 0) { + if (app->sink_params[id].file_name != NULL) + free(app->sink_params[id].file_name); + return -EINVAL; + } + } + + i++; + + if (i == p->n_pktq_out) + return 0; + } + + return -EINVAL; +} + +static int parse_pipeline_pktq_in(struct app_params *app, struct
[dpdk-dev] [PATCH 3/4] lib/librte_port: add packet dumping to PCAP file support in sink port
Originally, sink ports in librte_port releases received mbufs back to mempool. This patch adds optional packet dumping to PCAP feature in sink port: the packets will be dumped to user defined PCAP file for storage or debugging. The user may also choose the sink port's activity: either it continuously dump the packets to the file, or stops at certain dumping This feature shares same CONFIG_RTE_PORT_PCAP compiler option as source port PCAP file support feature. Users can enable or disable this feature by setting CONFIG_RTE_PORT_PCAP compiler option "y" or "n". Signed-off-by: Fan Zhang Acked-by: Cristian Dumitrescu --- lib/librte_port/rte_port_source_sink.c | 268 +++-- lib/librte_port/rte_port_source_sink.h | 11 +- 2 files changed, 263 insertions(+), 16 deletions(-) diff --git a/lib/librte_port/rte_port_source_sink.c b/lib/librte_port/rte_port_source_sink.c index 44fc0d5..2878014 100644 --- a/lib/librte_port/rte_port_source_sink.c +++ b/lib/librte_port/rte_port_source_sink.c @@ -37,6 +37,7 @@ #include #include #include +#include #ifdef RTE_PORT_PCAP #include @@ -345,12 +346,183 @@ rte_port_source_stats_read(void *port, struct rte_port_sink { struct rte_port_out_stats stats; + + /* PCAP dumper handle and pkts number */ + void *dumper; + uint32_t max_pkts; + uint32_t pkt_index; + uint32_t dump_finish; }; +#ifdef RTE_PORT_PCAP + +/** + * Open PCAP file for dumping packets to the file later + * + * @param port + * Handle to sink port + * @param p + * Sink port parameter + * @return + * 0 on SUCCESS + * error code otherwise + */ +static int +pcap_sink_open(struct rte_port_sink *port, + __rte_unused struct rte_port_sink_params *p) +{ + pcap_t *tx_pcap; + pcap_dumper_t *pcap_dumper; + + if (p->file_name == NULL) { + port->dumper = NULL; + port->max_pkts = 0; + port->pkt_index = 0; + port->dump_finish = 0; + return 0; + } + + /** Open a dead pcap handler for opening dumper file */ + tx_pcap = pcap_open_dead(DLT_EN10MB, 65535); + if (tx_pcap == NULL) + return -1; + + /* The 
dumper is created using the previous pcap_t reference */ + pcap_dumper = pcap_dump_open(tx_pcap, p->file_name); + if (pcap_dumper == NULL) + return -1; + + port->dumper = pcap_dumper; + port->max_pkts = p->max_n_pkts; + port->pkt_index = 0; + port->dump_finish = 0; + + return 0; +} + +uint8_t jumbo_pkt_buf[ETHER_MAX_JUMBO_FRAME_LEN]; + +/** + * Dump a packet to PCAP dumper + * + * @param p + * Handle to sink port + * @param mbuf + * Handle to mbuf structure holding the packet + */ +static void +pcap_sink_dump_pkt(struct rte_port_sink *port, struct rte_mbuf *mbuf) +{ + uint8_t *pcap_dumper = (uint8_t *)(port->dumper); + struct pcap_pkthdr pcap_hdr; + uint8_t *pkt; + + /* Maximum num packets already reached */ + if (port->dump_finish) + return; + + pkt = rte_pktmbuf_mtod(mbuf, uint8_t *); + + pcap_hdr.len = mbuf->pkt_len; + pcap_hdr.caplen = pcap_hdr.len; + gettimeofday(&(pcap_hdr.ts), NULL); + + if (mbuf->nb_segs > 1) { + struct rte_mbuf *jumbo_mbuf; + uint32_t pkt_index = 0; + + /* if packet size longer than ETHER_MAX_JUMBO_FRAME_LEN, +* ignore it. 
+*/ + if (mbuf->pkt_len > ETHER_MAX_JUMBO_FRAME_LEN) + return; + + for (jumbo_mbuf = mbuf; jumbo_mbuf != NULL; + jumbo_mbuf = jumbo_mbuf->next) { + rte_memcpy(_pkt_buf[pkt_index], + rte_pktmbuf_mtod(jumbo_mbuf, uint8_t *), + jumbo_mbuf->data_len); + pkt_index += jumbo_mbuf->data_len; + } + + jumbo_pkt_buf[pkt_index] = '\0'; + + pkt = jumbo_pkt_buf; + } + + pcap_dump(pcap_dumper, _hdr, pkt); + + port->pkt_index++; + + if ((port->max_pkts != 0) && (port->pkt_index >= port->max_pkts)) { + port->dump_finish = 1; + RTE_LOG(INFO, PORT, "Dumped %u packets to file\n", + port->pkt_index); + } + +} + +/** + * Flush pcap dumper + * + * @param dumper + * Handle to pcap dumper + */ + +static void +pcap_sink_flush_pkt(void *dumper) +{ + pcap_dumper_t *pcap_dumper = (pcap_dumper_t *)dumper; + + pcap_dump_flush(pcap_dumper); +} + +/** + * Close a PCAP dumper handle + * + * @param dumper + * Handle to pcap dumper + */ +static void +pcap_sink_close(void *dumper) +{ + pcap_dumper_t *pcap_dumper = (pcap_dumper_t *)dumper; + + pcap_dump_close(pcap_dumper); +} + +#else + +static int +pcap_sink_open(struct rte_port_sink *port, +
[dpdk-dev] [PATCH 2/4] examples/ip_pipeline: add PCAP file support
This patch add PCAP file support to ip_pipeline. Input port type SOURCE now supports loading specific PCAP file and sends the packets in it to pipeline instance. The packets are then released by SINK output port. This feature can be applied by specifying parameters in configuration file as shown below; [PIPELINE1] type = PASS-THROUGH core = 1 pktq_in = SOURCE0 SOURCE1 pktq_out = SINK0 SINK1 pcap_file_rd = /path/to/eth1.PCAP /path/to/eth2.PCAP pcap_bytes_rd_per_pkt = 0 64 The configuration section "pcap_file_rd" contains full path and name of the PCAP file to be loaded. If multiple SOURCEs exists, each shall have its own PCAP file path listed in this section, separated by spaces. Multiple SOURCE ports may share same PCAP file to be copied. The configuration section "pcap_bytes_rd_per_pkt" contains integer value and indicates the maximum number of bytes to be copied from each packet in the PCAP file. If this value is "0", all packets in the file will be copied fully; if the packet size is smaller than the assigned value, the entire packet is copied. Same as "pcap_file_rd", every SOURCE shall have its own maximum copy byte number. To enable PCAP support to IP pipeline, the compiler option CONFIG_RTE_PORT_PCAP must be set to 'y'. It is possible to disable PCAP support by removing "pcap_file_rd" and "pcap_bytes_rd_per_pkt" lines from the configuration file. 
Signed-off-by: Fan Zhang Acked-by: Cristian Dumitrescu --- examples/ip_pipeline/app.h | 2 + examples/ip_pipeline/config_parse.c | 102 +++- examples/ip_pipeline/init.c | 8 +++ 3 files changed, 110 insertions(+), 2 deletions(-) diff --git a/examples/ip_pipeline/app.h b/examples/ip_pipeline/app.h index 6510d6d..9dbe668 100644 --- a/examples/ip_pipeline/app.h +++ b/examples/ip_pipeline/app.h @@ -148,6 +148,8 @@ struct app_pktq_source_params { uint32_t parsed; uint32_t mempool_id; /* Position in the app->mempool_params array */ uint32_t burst; + char *file_name; /* Full path of PCAP file to be copied to mbufs */ + uint32_t n_bytes_per_pkt; }; struct app_pktq_sink_params { diff --git a/examples/ip_pipeline/config_parse.c b/examples/ip_pipeline/config_parse.c index 1bedbe4..f0bed81 100644 --- a/examples/ip_pipeline/config_parse.c +++ b/examples/ip_pipeline/config_parse.c @@ -178,6 +178,8 @@ struct app_pktq_source_params default_source_params = { .parsed = 0, .mempool_id = 0, .burst = 32, + .file_name = NULL, + .n_bytes_per_pkt = 0, }; struct app_pktq_sink_params default_sink_params = { @@ -924,6 +926,83 @@ parse_eal(struct app_params *app, } static int +parse_pipeline_pcap_source(struct app_params *app, + struct app_pipeline_params *p, + const char *file_name, const char *cp_size) +{ + const char *next = NULL; + char *end; + uint32_t i; + int parse_file = 0; + + if (file_name && !cp_size) { + next = file_name; + parse_file = 1; /* parse file path */ + } else if (cp_size && !file_name) { + next = cp_size; + parse_file = 0; /* parse copy size */ + } else + return -EINVAL; + + char name[APP_PARAM_NAME_SIZE]; + size_t name_len; + + if (p->n_pktq_in == 0) + return -EINVAL; + + for (i = 0; i < p->n_pktq_in; i++) { + if (p->pktq_in[i].type != APP_PKTQ_IN_SOURCE) + return -EINVAL; + } + + i = 0; + while (*next != '\0') { + uint32_t id; + + if (i >= p->n_pktq_in) + return -EINVAL; + + id = p->pktq_in[i].id; + + end = strchr(next, ' '); + if (!end) + name_len = strlen(next); + else 
+ name_len = end - next; + + if (name_len == 0 || name_len == sizeof(name)) + return -EINVAL; + + strncpy(name, next, name_len); + name[name_len] = '\0'; + next += name_len; + if (*next != '\0') + next++; + + if (parse_file) { + app->source_params[id].file_name = strdup(name); + if (app->source_params[id].file_name == NULL) + return -ENOMEM; + } else { + if (parser_read_uint32( + >source_params[id].n_bytes_per_pkt, + name) != 0) { + if (app->source_params[id].file_name != NULL) + free(app->source_params[id].file_name); + return -EINVAL; + } + } + + i++; + + if (i == p->n_pktq_in) + return 0; + } + +
[dpdk-dev] [PATCH 0/4] Add PCAP support to source and sink port
This patchset adds a feature to the source and sink port types in the librte_port library, and to examples/ip_pipeline. Originally, source/sink ports act as the input and output of a NULL packet generator. This patchset enables them to read from and write to a specific PCAP file, to generate and dump packets. Acked-by: Cristian Dumitrescu Fan Zhang (4): lib/librte_port: add PCAP file support to source port examples/ip_pipeline: add PCAP file support lib/librte_port: add packet dumping to PCAP file support in sink port examples/ip_pipeline: add packets dumping to PCAP file support config/common_bsdapp | 1 + config/common_linuxapp | 1 + examples/ip_pipeline/app.h | 4 + examples/ip_pipeline/config_parse.c| 261 ++- examples/ip_pipeline/init.c| 19 ++ examples/ip_pipeline/pipeline_be.h | 2 + lib/librte_port/Makefile | 4 + lib/librte_port/rte_port_source_sink.c | 458 +++-- lib/librte_port/rte_port_source_sink.h | 18 +- mk/rte.app.mk | 1 + 10 files changed, 751 insertions(+), 18 deletions(-) -- 2.5.0
[dpdk-dev] [PATCH 1/2] examples/ip_pipeline: CPU utilization measurement and rate computation
This patch adds CPU utilization measurement and rate computation to packet framework. The measurement is done by measuring the cycles spent while a thread pulls zero packet from RX queue. These cycles are treated as idle cycles (or headroom). The idle thread rate is updated once per second. Signed-off-by: Fan Zhang Acked-by: Cristian Dumitrescu --- examples/ip_pipeline/app.h| 7 examples/ip_pipeline/init.c | 5 +++ examples/ip_pipeline/thread.c | 81 +-- examples/ip_pipeline/thread.h | 13 +++ 4 files changed, 104 insertions(+), 2 deletions(-) diff --git a/examples/ip_pipeline/app.h b/examples/ip_pipeline/app.h index 6510d6d..2b134f1 100644 --- a/examples/ip_pipeline/app.h +++ b/examples/ip_pipeline/app.h @@ -263,6 +263,11 @@ struct app_thread_data { struct rte_ring *msgq_in; struct rte_ring *msgq_out; + + uint64_t time_updated; + uint64_t hz; + uint64_t headroom; + double headroom_rate; }; struct app_eal_params { @@ -421,6 +426,8 @@ struct app_eal_params { #define APP_MAX_CMDS 64 #endif +#define APP_THREAD_HEADROOM_STATS_COLLECT + struct app_params { /* Config */ char app_name[APP_APPNAME_SIZE]; diff --git a/examples/ip_pipeline/init.c b/examples/ip_pipeline/init.c index 186ca03..f4c1239 100644 --- a/examples/ip_pipeline/init.c +++ b/examples/ip_pipeline/init.c @@ -1379,6 +1379,11 @@ app_init_threads(struct app_params *app) t->timer_period = (rte_get_tsc_hz() * APP_THREAD_TIMER_PERIOD) / 1000; t->thread_req_deadline = time + t->timer_period; + t->headroom = 0; + t->headroom_rate = 0.0; + t->time_updated = time; + t->hz = rte_get_tsc_hz(); + t->msgq_in = app_thread_msgq_in_get(app, params->socket_id, params->core_id, diff --git a/examples/ip_pipeline/thread.c b/examples/ip_pipeline/thread.c index 78f1bd8..0e37a26 100644 --- a/examples/ip_pipeline/thread.c +++ b/examples/ip_pipeline/thread.c @@ -39,6 +39,36 @@ #include "app.h" #include "thread.h" +#ifdef APP_THREAD_HEADROOM_STATS_COLLECT + +static void +thread_headroom_measure_start(uint64_t *t0) +{ + *t0 = rte_rdtsc(); 
+} + +static void +thread_headroom_measure_stop(int n_pkts, + uint64_t t0, struct app_thread_data *t) +{ + if (n_pkts == 0) { + uint64_t t1 = rte_rdtsc(); + + t->headroom += t1 - t0; + } +} + +#else + +static void +thread_headroom_measure_start(uint64_t *t0) {} + +static void +thread_headroom_measure_stop(int n_pkts, + uint64_t t0, struct app_thread_data *t) {} + +#endif + static inline void * thread_msg_recv(struct rte_ring *r) { @@ -140,6 +170,17 @@ thread_pipeline_disable(struct app_thread_data *t, } static int +thread_headroom(struct app_thread_data *t, + void *req) +{ + struct thread_show_headroom_msg_rsp *rsp = req; + + rsp->headroom = t->headroom_rate; + + return 0; +} + +static int thread_msg_req_handle(struct app_thread_data *t) { void *msg_ptr; @@ -165,6 +206,14 @@ thread_msg_req_handle(struct app_thread_data *t) thread_msg_send(t->msgq_out, rsp); break; } + + case THREAD_MSG_REQ_HEADROOM: { + rsp->status = thread_headroom(t, + (struct thread_show_headroom_msg_req *) req); + thread_msg_send(t->msgq_out, rsp); + break; + } + default: break; } @@ -187,15 +236,23 @@ app_thread(void *arg) for (j = 0; j < n_regular; j++) { struct app_thread_pipeline_data *data = >regular[j]; struct pipeline *p = data->be; + uint64_t t0; + int n_pkts; - rte_pipeline_run(p->p); + thread_headroom_measure_start(); + n_pkts = rte_pipeline_run(p->p); + thread_headroom_measure_stop(n_pkts, t0, t); } /* Run custom pipelines */ for (j = 0; j < n_custom; j++) { struct app_thread_pipeline_data *data = >custom[j]; + uint64_t t0; + int n_pkts; - data->f_run(data->be); + thread_headroom_measure_start(); + n_pkts = data->f_run(data->be); + thread_headroom_measure_stop(n_pkts, t0, t); } /* Timer */ @@ -252,6 +309,26 @@ app_thread(void *arg) t_deadline = deadline; } + /* Timer
[dpdk-dev] [PATCH 0/2] Add CPU utilization to packet framework
This patchset adds CPU utilization rate computation and CLI command support to the packet framework. The thread idle rate is updated once per second. Users can use the thread CLI command to display it. Fan Zhang (2): examples/ip_pipeline: CPU utilization measurement and rate computation examples/ip_pipeline: add CLI command to display CPU utilization rate examples/ip_pipeline/app.h | 7 +++ examples/ip_pipeline/init.c | 5 ++ examples/ip_pipeline/thread.c| 81 +++- examples/ip_pipeline/thread.h| 13 + examples/ip_pipeline/thread_fe.c | 114 +++ examples/ip_pipeline/thread_fe.h | 6 +++ 6 files changed, 224 insertions(+), 2 deletions(-) -- 2.5.0
[dpdk-dev] [PATCH v5 0/3] Handle SIGINT and SIGTERM in DPDK examples
2015-12-30 16:59, Zhihong Wang: > Zhihong Wang (3): > app/test-pmd: Handle SIGINT and SIGTERM in testpmd > examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd > examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd Applied, thanks
[dpdk-dev] [RFC PATCH 5/5] virtio: Extend virtio-net PMD to support container environment
On 1/21/2016 7:09 PM, Tetsuya Mukawa wrote: > +qtest_find_pci_device(struct qtest_session *s, uint16_t bus, uint8_t device) > +{ > + struct qtest_pci_device *dev; > + uint32_t val; > + > + val = qtest_pci_inl(s, bus, device, 0, 0); > + TAILQ_FOREACH(dev, >head, next) { > + if (val == ((uint32_t)dev->device_id << 16 | dev->vendor_id)) { > + dev->bus_addr = bus; > + dev->device_addr = device; > + return; > + } > + > + } > +} > + > +static int > +qtest_init_pci_devices(struct qtest_session *s) > +{ > + struct qtest_pci_device *dev; > + uint16_t bus; > + uint8_t device; > + int ret; > + > + /* Find devices */ > + bus = 0; > + do { > + device = 0; > + do { > + qtest_find_pci_device(s, bus, device); > + } while (device++ != NB_DEVICE - 1); > + } while (bus++ != NB_BUS - 1); It seems this scan of all the PCI devices is a very time-consuming operation, and each scan involves socket communication. Did you measure how long the PCI device initialization takes? > + > + /* Initialize devices */ > + TAILQ_FOREACH(dev, >head, next) { > + ret = dev->init(s, dev); > + if (ret != 0) > + return ret; > + } > + > + return 0;
[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMU driver mode
2016-01-27 21:02, Santosh Shukla: > 1. virtio currently works for vfio+noiommu and likely will work for > vfio+iommu in the near future. > 2. So remove the __noiommu suffix and always use the default. > 3. Introduce a global vfio resource parsing function. That function > is supposed to do parsing for the default vfio case and for the vfio-noiommu case. > This function will be used by PMD drivers for resource parsing purposes, > for example virtio. > > Yuan won't be happy with 3) I guess, because he wanted to get rid of > interface parsing from the PMD driver. > > Thomas, if 1/2/3/ addresses your concern then I'll spin the series, I agree with 1/ and 2/. Please, could you explain why 3/ is needed?
[dpdk-dev] [PATCH] ethdev: fix byte order inconsistency between fdir flow and mask
Fixed issue of byte order in ethdev library that the structure for setting fdir's mask and flow entry is inconsist and made inputs of mask be in big endian. fixes: 76c6f89e80d4 ("ixgbe: support new flow director masks") 2d4c1a9ea2ac ("ethdev: add new flow director masks") Reported-by: Yaacov Hazan Signed-off-by: Jingjing Wu --- app/test-pmd/cmdline.c | 6 ++--- doc/guides/rel_notes/release_2_3.rst | 6 + drivers/net/ixgbe/ixgbe_fdir.c | 47 ++-- lib/librte_ether/rte_eth_ctrl.h | 7 -- 4 files changed, 43 insertions(+), 23 deletions(-) diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c index 73298c9..13194c9 100644 --- a/app/test-pmd/cmdline.c +++ b/app/test-pmd/cmdline.c @@ -8687,13 +8687,13 @@ cmd_flow_director_mask_parsed(void *parsed_result, return; } - mask->vlan_tci_mask = res->vlan_mask; + mask->vlan_tci_mask = rte_cpu_to_be_16(res->vlan_mask); IPV4_ADDR_TO_UINT(res->ipv4_src, mask->ipv4_mask.src_ip); IPV4_ADDR_TO_UINT(res->ipv4_dst, mask->ipv4_mask.dst_ip); IPV6_ADDR_TO_ARRAY(res->ipv6_src, mask->ipv6_mask.src_ip); IPV6_ADDR_TO_ARRAY(res->ipv6_dst, mask->ipv6_mask.dst_ip); - mask->src_port_mask = res->port_src; - mask->dst_port_mask = res->port_dst; + mask->src_port_mask = rte_cpu_to_be_16(res->port_src); + mask->dst_port_mask = rte_cpu_to_be_16(res->port_dst); } cmd_reconfig_device_queue(res->port_id, 1, 1); diff --git a/doc/guides/rel_notes/release_2_3.rst b/doc/guides/rel_notes/release_2_3.rst index 99de186..28d0f27 100644 --- a/doc/guides/rel_notes/release_2_3.rst +++ b/doc/guides/rel_notes/release_2_3.rst @@ -19,6 +19,10 @@ Drivers Libraries ~ +* ** fix byte order inconsistence between fdir flow and mask ** + + Fixed issue in ethdev library that the structure for setting + fdir's mask and flow entry is inconsist in byte order. Examples @@ -39,6 +43,8 @@ API Changes ABI Changes --- +* The fields in The ethdev structures ``rte_eth_fdir_masks`` were + changed to be in big endian. 
Shared Library Versions --- diff --git a/drivers/net/ixgbe/ixgbe_fdir.c b/drivers/net/ixgbe/ixgbe_fdir.c index e03219b..7423b2d 100644 --- a/drivers/net/ixgbe/ixgbe_fdir.c +++ b/drivers/net/ixgbe/ixgbe_fdir.c @@ -309,6 +309,7 @@ fdir_set_input_mask_82599(struct rte_eth_dev *dev, uint32_t fdiripv6m; /* IPv6 source and destination masks. */ uint16_t dst_ipv6m = 0; uint16_t src_ipv6m = 0; + volatile uint32_t *reg; PMD_INIT_FUNC_TRACE(); @@ -322,16 +323,16 @@ fdir_set_input_mask_82599(struct rte_eth_dev *dev, /* use the L4 protocol mask for raw IPv4/IPv6 traffic */ fdirm |= IXGBE_FDIRM_L4P; - if (input_mask->vlan_tci_mask == 0x0FFF) + if (input_mask->vlan_tci_mask == rte_cpu_to_be_16(0x0FFF)) /* mask VLAN Priority */ fdirm |= IXGBE_FDIRM_VLANP; - else if (input_mask->vlan_tci_mask == 0xE000) + else if (input_mask->vlan_tci_mask == rte_cpu_to_be_16(0xE000)) /* mask VLAN ID */ fdirm |= IXGBE_FDIRM_VLANID; else if (input_mask->vlan_tci_mask == 0) /* mask VLAN ID and Priority */ fdirm |= IXGBE_FDIRM_VLANID | IXGBE_FDIRM_VLANP; - else if (input_mask->vlan_tci_mask != 0xEFFF) { + else if (input_mask->vlan_tci_mask != rte_cpu_to_be_16(0xEFFF)) { PMD_INIT_LOG(ERR, "invalid vlan_tci_mask"); return -EINVAL; } @@ -340,19 +341,26 @@ fdir_set_input_mask_82599(struct rte_eth_dev *dev, IXGBE_WRITE_REG(hw, IXGBE_FDIRM, fdirm); /* store the TCP/UDP port masks, bit reversed from port layout */ - fdirtcpm = reverse_fdir_bitmasks(input_mask->dst_port_mask, -input_mask->src_port_mask); + fdirtcpm = reverse_fdir_bitmasks( + rte_be_to_cpu_16(input_mask->dst_port_mask), + rte_be_to_cpu_16(input_mask->src_port_mask)); - /* write all the same so that UDP, TCP and SCTP use the same mask */ + /* write all the same so that UDP, TCP and SCTP use the same mask +* (little-endian) + */ IXGBE_WRITE_REG(hw, IXGBE_FDIRTCPM, ~fdirtcpm); IXGBE_WRITE_REG(hw, IXGBE_FDIRUDPM, ~fdirtcpm); IXGBE_WRITE_REG(hw, IXGBE_FDIRSCTPM, ~fdirtcpm); info->mask.src_port_mask = input_mask->src_port_mask; 
info->mask.dst_port_mask = input_mask->dst_port_mask; - /* Store source and destination IPv4 masks (big-endian) */ - IXGBE_WRITE_REG(hw, IXGBE_FDIRSIP4M, ~(input_mask->ipv4_mask.src_ip)); - IXGBE_WRITE_REG(hw, IXGBE_FDIRDIP4M, ~(input_mask->ipv4_mask.dst_ip)); + /* Store source
[dpdk-dev] [PATCH 2/2] kdp: add virtual PMD for kernel slow data path communication
This patch provides slow data path communication to the Linux kernel. The patch is based on librte_kni, and heavily re-uses it. The main difference is that the librte_kni library is converted into a PMD, to provide ease of use for applications. Now any application can use slow path communication without any update in the application, because of the existing EAL support for virtual PMDs. This PMD also supports two methods to send packets to Linux: the first one is a custom FIFO implementation with the help of the KDP kernel module, the second one is the Linux in-kernel tun/tap support. The PMD first checks for the KDP kernel module; if that fails, it tries to create and use a tap interface. With the FIFO method: the PMD's rx_pkt_burst() gets packets from the FIFO, and tx_pkt_burst() puts packets into the FIFO. The corresponding Linux virtual network device driver code also gets/puts packets from/to the FIFO as if they were coming from hardware. With the tun/tap method: no external kernel module is required, the PMD reads packets from and writes packets to the tap interface file descriptor. The tap interface has a performance penalty compared to the FIFO implementation.
Signed-off-by: Ferruh Yigit --- config/common_linuxapp | 1 + doc/guides/nics/pcap_ring.rst | 125 - doc/guides/rel_notes/release_2_3.rst| 6 + drivers/net/Makefile| 3 +- drivers/net/kdp/Makefile| 61 drivers/net/kdp/rte_eth_kdp.c | 481 drivers/net/kdp/rte_kdp.c | 365 drivers/net/kdp/rte_kdp.h | 126 + drivers/net/kdp/rte_kdp_fifo.h | 91 ++ drivers/net/kdp/rte_kdp_tap.c | 96 +++ drivers/net/kdp/rte_pmd_kdp_version.map | 4 + lib/librte_eal/common/include/rte_log.h | 3 +- mk/rte.app.mk | 3 +- 13 files changed, 1359 insertions(+), 6 deletions(-) create mode 100644 drivers/net/kdp/Makefile create mode 100644 drivers/net/kdp/rte_eth_kdp.c create mode 100644 drivers/net/kdp/rte_kdp.c create mode 100644 drivers/net/kdp/rte_kdp.h create mode 100644 drivers/net/kdp/rte_kdp_fifo.h create mode 100644 drivers/net/kdp/rte_kdp_tap.c create mode 100644 drivers/net/kdp/rte_pmd_kdp_version.map diff --git a/config/common_linuxapp b/config/common_linuxapp index 73c91d8..b9dec0c 100644 --- a/config/common_linuxapp +++ b/config/common_linuxapp @@ -322,6 +322,7 @@ CONFIG_RTE_LIBRTE_PMD_NULL=y # # Compile KDP PMD # +CONFIG_RTE_LIBRTE_PMD_KDP=y CONFIG_RTE_KDP_KMOD=y CONFIG_RTE_KDP_PREEMPT_DEFAULT=y diff --git a/doc/guides/nics/pcap_ring.rst b/doc/guides/nics/pcap_ring.rst index 46aa3ac..78b7b61 100644 --- a/doc/guides/nics/pcap_ring.rst +++ b/doc/guides/nics/pcap_ring.rst @@ -28,11 +28,11 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -Libpcap and Ring Based Poll Mode Drivers - +Software Poll Mode Drivers +== In addition to Poll Mode Drivers (PMDs) for physical and virtual hardware, -the DPDK also includes two pure-software PMDs. These two drivers are: +the DPDK also includes pure-software PMDs. These drivers are: * A libpcap -based PMD (librte_pmd_pcap) that reads and writes packets using libpcap, - both from files on disk, as well as from physical NIC devices using standard Linux kernel drivers. 
@@ -40,6 +40,10 @@ the DPDK also includes two pure-software PMDs. These two drivers are: * A ring-based PMD (librte_pmd_ring) that allows a set of software FIFOs (that is, rte_ring) to be accessed using the PMD APIs, as though they were physical NICs. +* A slow data path PMD (librte_pmd_kdp) that allows send/get packets to/from OS network +stack as it is a physical NIC. + + .. note:: The libpcap -based PMD is disabled by default in the build configuration files, @@ -211,6 +215,121 @@ Multiple devices may be specified, separated by commas. Done. +Kernel Data Path PMD + + +Kernel Data Path (KDP) PMD is to communicate with OS network stack easily by application. + +.. code-block:: console + +./testpmd --vdev eth_kdp0 --vdev eth_kdp1 -- -i +... +Configuring Port 0 (socket 0) +Port 0: 00:00:00:00:00:00 +Configuring Port 1 (socket 0) +Port 1: 00:00:00:00:00:00 +Checking link statuses... +Port 0 Link Up - speed 1 Mbps - full-duplex +Port 1 Link Up - speed 1 Mbps - full-duplex +Done + +KDP PMD supports two type of communication: + +* Custom FIFO implementation +* tun/tap implementation + +Custom FIFO implementation gives more performance but requires KDP kernel module (rte_kdp.ko) inserted. + +By default FIFO communication has priority, if KDP kernel module is not inserted, tun/tap communication used. + +If KDP kernel module inserted, above testpmd command will create following virtual
[dpdk-dev] [PATCH 1/2] kdp: add kernel data path kernel module
This kernel module is based on the KNI module, but this one is a stripped version of it and only for data messages; no control functionality is provided. The FIFO implementation of the KNI is kept exactly the same, but ethtool related code is removed and virtual network management related code is simplified. This module contains kernel support to create network devices, and this module has a simple driver for the virtual network device; the driver simply puts/gets packets to/from the FIFO instead of real hardware. The FIFO is created and owned by the userspace application, which in this case is the KDP PMD. In the long term this patch intends to replace the KNI, and the KNI will be deprecated. Signed-off-by: Ferruh Yigit --- config/common_linuxapp | 8 +- lib/librte_eal/linuxapp/Makefile | 5 +- lib/librte_eal/linuxapp/eal/Makefile | 3 +- .../linuxapp/eal/include/exec-env/rte_kdp_common.h | 143 + lib/librte_eal/linuxapp/kdp/Makefile | 56 ++ lib/librte_eal/linuxapp/kdp/kdp_dev.h | 82 +++ lib/librte_eal/linuxapp/kdp/kdp_fifo.h | 91 lib/librte_eal/linuxapp/kdp/kdp_misc.c | 463 + lib/librte_eal/linuxapp/kdp/kdp_net.c | 573 + 9 files changed, 1421 insertions(+), 3 deletions(-) create mode 100644 lib/librte_eal/linuxapp/eal/include/exec-env/rte_kdp_common.h create mode 100644 lib/librte_eal/linuxapp/kdp/Makefile create mode 100644 lib/librte_eal/linuxapp/kdp/kdp_dev.h create mode 100644 lib/librte_eal/linuxapp/kdp/kdp_fifo.h create mode 100644 lib/librte_eal/linuxapp/kdp/kdp_misc.c create mode 100644 lib/librte_eal/linuxapp/kdp/kdp_net.c diff --git a/config/common_linuxapp b/config/common_linuxapp index 74bc515..73c91d8 100644 --- a/config/common_linuxapp +++ b/config/common_linuxapp @@ -1,6 +1,6 @@ # BSD LICENSE # -# Copyright(c) 2010-2015 Intel Corporation. All rights reserved. +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -320,6 +320,12 @@ CONFIG_RTE_LIBRTE_PMD_XENVIRT=n CONFIG_RTE_LIBRTE_PMD_NULL=y # +# Compile KDP PMD +# +CONFIG_RTE_KDP_KMOD=y +CONFIG_RTE_KDP_PREEMPT_DEFAULT=y + +# # Do prefetch of packet data within PMD driver receive function # CONFIG_RTE_PMD_PACKET_PREFETCH=y diff --git a/lib/librte_eal/linuxapp/Makefile b/lib/librte_eal/linuxapp/Makefile index d9c5233..e3f91a7 100644 --- a/lib/librte_eal/linuxapp/Makefile +++ b/lib/librte_eal/linuxapp/Makefile @@ -1,6 +1,6 @@ # BSD LICENSE # -# Copyright(c) 2010-2014 Intel Corporation. All rights reserved. +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -38,6 +38,9 @@ DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal ifeq ($(CONFIG_RTE_KNI_KMOD),y) DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += kni endif +ifeq ($(CONFIG_RTE_KDP_KMOD),y) +DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += kdp +endif ifeq ($(CONFIG_RTE_LIBRTE_XEN_DOM0),y) DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += xen_dom0 endif diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index 26eced5..ac72aea 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -1,6 +1,6 @@ # BSD LICENSE # -# Copyright(c) 2010-2015 Intel Corporation. All rights reserved. +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -116,6 +116,7 @@ CFLAGS_eal_thread.o += -Wno-return-type endif INC := rte_interrupts.h rte_kni_common.h rte_dom0_common.h +INC += rte_kdp_common.h SYMLINK-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP)-include/exec-env := \ $(addprefix include/exec-env/,$(INC)) diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kdp_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kdp_common.h new file mode 100644 index 000..0c77f58 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kdp_common.h @@ -0,0 +1,143 @@ +/*- + * This file is provided under a dual BSD/LGPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GNU LESSER GENERAL PUBLIC LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have
[dpdk-dev] [PATCH 0/2] slow data path communication between DPDK port and Linux
This is slow data path communication implementation based on existing KNI. Difference is: librte_kni converted into a PMD, kdp kernel module is almost same except all control path functionality removed and some simplification done. Motivation is to simplify slow path data communication. Now any application can use this new PMD to send/get data to Linux kernel. PMD supports two communication methods: 1) KDP kernel module PMD initialization functions handles creating virtual interfaces (with help of kdp kernel module) and created FIFO. FIFO is used to share data between userspace and kernelspace. This is default method. 2) tun/tap module When KDP module is not inserted, PMD creates tap interface and transfers packets using tap interface. In long term this patch intends to replace the KNI and KNI will be depreciated. Sample usage: 1) Transfer any packet received from NIC that bound to DPDK, to the Linux kernel a) insert kdp kernel module insmod build/kmod/rte_kdp.ko b) bind NIC to the DPDK using dpdk_nic_bind.py c) ./testpmd --vdev eth_kdp0 c1) testpmd show two ports, one of them physical, other virtual ... Configuring Port 0 (socket 0) Port 0: 00:00:00:00:00:00 Configuring Port 1 (socket 0) ... Checking link statuses... Port 0 Link Up - speed 1 Mbps - full-duplex Port 1 Link Up - speed 1 Mbps - full-duplex Done c2) This will create "kdp0" Linux interface $ ip l show kdp0 21: kdp0:mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff d) Linux port can be used for data d1) $ ifconfig kdp0 1.0.0.2 $ ping 1.0.0.1 PING 1.0.0.1 (1.0.0.1) 56(84) bytes of data. 
64 bytes from 1.0.0.1: icmp_seq=1 ttl=64 time=0.789 ms 64 bytes from 1.0.0.1: icmp_seq=2 ttl=64 time=0.881 ms d2) $ tcpdump -nn -i kdp0 tcpdump: verbose output suppressed, use -v or -vv for full protocol decode listening on kdp0, link-type EN10MB (Ethernet), capture size 262144 bytes 15:01:22.407506 IP 1.0.0.1 > 1.0.0.2: ICMP echo request, id 40016, seq 18, length 64 15:01:22.408521 IP 1.0.0.2 > 1.0.0.1: ICMP echo reply, id 40016, seq 18, length 64 2) Data travels between virtual Linux interfaces pass from DPDK application, application can alter data a) insert kdp kernel module insmod build/kmod/rte_kdp.ko b) No physical NIC involved c) ./testpmd --vdev eth_kdp0 --vdev eth_kdp1 c1) testpmd show two ports, both of them are virtual ... Configuring Port 0 (socket 0) Port 0: 00:00:00:00:00:00 Configuring Port 1 (socket 0) Port 1: 00:00:00:00:00:00 Checking link statuses... Port 0 Link Up - speed 1 Mbps - full-duplex Port 1 Link Up - speed 1 Mbps - full-duplex Done c2) This will create "kdp0" and "kdp1" Linux interfaces $ ip l show kdp0; ip l show kdp1 22: kdp0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff 23: kdp1: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff d) Data travel between virtual ports pass from DPDK application $ifconfig kdp0 1.0.0.1 $ifconfig kdp1 1.0.0.2 d1) $ ping 1.0.0.1 PING 1.0.0.1 (1.0.0.1) 56(84) bytes of data. 
64 bytes from 1.0.0.1: icmp_seq=1 ttl=64 time=3.57 ms 64 bytes from 1.0.0.1: icmp_seq=2 ttl=64 time=1.85 ms 64 bytes from 1.0.0.1: icmp_seq=3 ttl=64 time=1.89 ms d2) $ tcpdump -nn -i kdp0 tcpdump: verbose output suppressed, use -v or -vv for full protocol decode listening on kdp0, link-type EN10MB (Ethernet), capture size 262144 bytes 15:20:51.908543 IP 1.0.0.2 > 1.0.0.1: ICMP echo request, id 41234, seq 1, length 64 15:20:51.909570 IP 1.0.0.1 > 1.0.0.2: ICMP echo reply, id 41234, seq 1, length 64 15:20:52.909551 IP 1.0.0.2 > 1.0.0.1: ICMP echo request, id 41234, seq 2, length 64 15:20:52.910577 IP 1.0.0.1 > 1.0.0.2: ICMP echo reply, id 41234, seq 2, length 64 3) tun/tap interface usage a) No external module required, tun/tap support in kernel required b) ./testpmd --vdev eth_kdp0 --vdev eth_kdp1 b1) This will create "tap_kdp0" and "tap_kdp1" Linux interfaces $ ip l show tap_kdp0; ip l show tap_kdp1 25: tap_kdp0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 500 link/ether 56:47:97:9c:03:8e brd ff:ff:ff:ff:ff:ff 26: tap_kdp1: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 500 link/ether 5e:15:22:b0:52:42 brd ff:ff:ff:ff:ff:ff Ferruh Yigit (2): kdp: add kernel data path kernel module kdp: add virtual PMD for kernel slow data path communication config/common_linuxapp | 9 +- doc/guides/nics/pcap_ring.rst | 125 - doc/guides/rel_notes/release_2_3.rst | 6 + drivers/net/Makefile | 3 +- drivers/net/kdp/Makefile | 61 +++ drivers/net/kdp/rte_eth_kdp.c | 481
[dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms
> Zhihong Wang (5): > lib/librte_eal: Identify AVX512 CPU flag > mk: Predefine AVX512 macro for compiler > lib/librte_eal: Optimize memcpy for AVX512 platforms > app/test: Adjust alignment unit for memcpy perf test > lib/librte_eal: Tune memcpy for prior platforms > > app/test/test_memcpy_perf.c| 6 + > .../common/include/arch/x86/rte_cpuflags.h | 2 + > .../common/include/arch/x86/rte_memcpy.h | 269 > - > mk/rte.cpuflags.mk | 4 + > 4 files changed, 268 insertions(+), 13 deletions(-) The maintainers of arch/x86 are Bruce and Konstantin. I guess there is no comment and we can apply this cool series?
[dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode
Hi Thomas, > > Is it possible (is it better) to declare these functions with > > vfio_dma_func_t? > > Yeah, sure. Or maybe the other way around - maybe we could do away with > the typedef. I'll go for the former though. No, we can't declare the functions with a function pointer. At least I don't see any obvious way to do that without incurring multiple declarations compile error. So I'll leave it as forward declarations. Of course, the other alternative is to put the array below the functions and make them static, to avoid forward declarations, but I think it's much clearer the way it is now. Thanks, Anatoly
[dpdk-dev] [PATCH 3/3] examples/ethtool: add control interface support to the application
Control interface APIs added into the sample application. To have the support corresponding kernel module (KCP) needs to be inserted. If kernel module is not there, application will run as it is without kernel control path support. When KCP module inserted, running application creates a virtual Linux network interface (dpdk$) per DPDK port. This interface can be used by traditional Linux tools. Signed-off-by: Ferruh Yigit --- doc/guides/sample_app_ug/ethtool.rst | 41 examples/ethtool/ethtool-app/main.c | 10 +++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/doc/guides/sample_app_ug/ethtool.rst b/doc/guides/sample_app_ug/ethtool.rst index 4d1697e..2174288 100644 --- a/doc/guides/sample_app_ug/ethtool.rst +++ b/doc/guides/sample_app_ug/ethtool.rst @@ -131,6 +131,47 @@ application`_. Individual call-back functions handle the detail associated with each command, which make use of the functions defined in the `Ethtool interface`_ to the DPDK functions. +Control Interface +~ + +If Kernel Control Path (KCP) kernel module (rte_kcp.ko) inserted, +virtual interfaces created for each DPDK port for control purposes. + +Created interfaces are named as dpdk#, like: + +.. code-block:: console + +# ifconfig dpdk0; ifconfig dpdk1 +dpdk0: flags=4099mtu 1500 +ether 90:e2:ba:0e:49:b9 txqueuelen 1000 (Ethernet) +RX packets 0 bytes 0 (0.0 B) +RX errors 0 dropped 0 overruns 0 frame 0 +TX packets 0 bytes 0 (0.0 B) +TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 + +dpdk1: flags=4099 mtu 1500 +ether 00:1b:21:76:fa:21 txqueuelen 1000 (Ethernet) +RX packets 0 bytes 0 (0.0 B) +RX errors 0 dropped 0 overruns 0 frame 0 +TX packets 0 bytes 0 (0.0 B) +TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 + +Regular Linux commands can be issued on interfaces: + +.. 
code-block:: console + +# ethtool -i dpdk0 +driver: rte_ixgbe_pmd +version: RTE 2.3.0-rc0 +firmware-version: +expansion-rom-version: +bus-info: :08:00.1 +supports-statistics: yes +supports-test: no +supports-eeprom-access: yes +supports-register-dump: yes +supports-priv-flags: no + Ethtool interface - diff --git a/examples/ethtool/ethtool-app/main.c b/examples/ethtool/ethtool-app/main.c index e21abcd..68b13ad 100644 --- a/examples/ethtool/ethtool-app/main.c +++ b/examples/ethtool/ethtool-app/main.c @@ -1,7 +1,7 @@ /*- * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. All rights reserved. + * Copyright(c) 2016 Intel Corporation. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -44,6 +44,7 @@ #include #include #include +#include #include "ethapp.h" @@ -54,7 +55,6 @@ #define PKTPOOL_EXTRA_SIZE 512 #define PKTPOOL_CACHE 32 - struct txq_port { uint16_t cnt_unsent; struct rte_mbuf *buf_frames[MAX_BURST_LENGTH]; @@ -254,6 +254,8 @@ static int slave_main(__attribute__((unused)) void *ptr_data) } rte_spinlock_unlock(_port->lock); } /* end for( idx_port ) */ + rte_eth_control_interface_process_msg( + RTE_ETHTOOL_CTRL_IF_PROCESS_MSG, 0); } /* end for(;;) */ return 0; @@ -293,6 +295,8 @@ int main(int argc, char **argv) id_core = rte_get_next_lcore(id_core, 1, 1); rte_eal_remote_launch(slave_main, NULL, id_core); + rte_eth_control_interface_create(); + ethapp_main(); app_cfg.exit_now = 1; @@ -301,5 +305,7 @@ int main(int argc, char **argv) return -1; } + rte_eth_control_interface_destroy(); + return 0; } -- 2.5.0
[dpdk-dev] [PATCH 2/3] rte_ctrl_if: add control interface library
This library gets control messages from kernelspace, forwards them to librte_ether and returns the response back to kernelspace. The library does the following: 1) Triggers Linux virtual interface creation 2) Initializes the netlink socket communication 3) Provides a process() API to the application that processes the received messages. This library requires the corresponding kernel module to be inserted. Signed-off-by: Ferruh Yigit --- config/common_linuxapp | 3 +- doc/api/doxy-api-index.md | 3 +- doc/api/doxy-api.conf | 1 + doc/guides/rel_notes/release_2_3.rst | 9 + lib/Makefile | 3 +- lib/librte_ctrl_if/Makefile| 58 + lib/librte_ctrl_if/rte_ctrl_if.c | 162 + lib/librte_ctrl_if/rte_ctrl_if.h | 115 ++ lib/librte_ctrl_if/rte_ctrl_if_version.map | 9 + lib/librte_ctrl_if/rte_ethtool.c | 354 + lib/librte_ctrl_if/rte_ethtool.h | 54 + lib/librte_ctrl_if/rte_nl.c| 259 + lib/librte_ctrl_if/rte_nl.h| 60 + lib/librte_eal/common/include/rte_log.h| 3 +- mk/rte.app.mk | 3 +- 15 files changed, 1091 insertions(+), 5 deletions(-) create mode 100644 lib/librte_ctrl_if/Makefile create mode 100644 lib/librte_ctrl_if/rte_ctrl_if.c create mode 100644 lib/librte_ctrl_if/rte_ctrl_if.h create mode 100644 lib/librte_ctrl_if/rte_ctrl_if_version.map create mode 100644 lib/librte_ctrl_if/rte_ethtool.c create mode 100644 lib/librte_ctrl_if/rte_ethtool.h create mode 100644 lib/librte_ctrl_if/rte_nl.c create mode 100644 lib/librte_ctrl_if/rte_nl.h diff --git a/config/common_linuxapp b/config/common_linuxapp index 5d5e3e4..f72ba0e 100644 --- a/config/common_linuxapp +++ b/config/common_linuxapp @@ -1,6 +1,6 @@ # BSD LICENSE # -# Copyright(c) 2010-2015 Intel Corporation. All rights reserved. +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -507,6 +507,7 @@ CONFIG_RTE_KNI_VHOST_DEBUG_TX=n # CONFIG_RTE_KCP_KMOD=y CONFIG_RTE_KCP_KO_DEBUG=n +CONFIG_RTE_LIBRTE_CTRL_IF=y # # Compile vhost library diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md index 7a91001..214d16e 100644 --- a/doc/api/doxy-api-index.md +++ b/doc/api/doxy-api-index.md @@ -149,4 +149,5 @@ There are many libraries, so their headers may be grouped by topics: [common] (@ref rte_common.h), [ABI compat] (@ref rte_compat.h), [keepalive] (@ref rte_keepalive.h), - [version](@ref rte_version.h) + [version](@ref rte_version.h), + [control interface] (@ref rte_ctrl_if.h) diff --git a/doc/api/doxy-api.conf b/doc/api/doxy-api.conf index 57e8b5d..fd69bf1 100644 --- a/doc/api/doxy-api.conf +++ b/doc/api/doxy-api.conf @@ -39,6 +39,7 @@ INPUT = doc/api/doxy-api-index.md \ lib/librte_cmdline \ lib/librte_compat \ lib/librte_cryptodev \ + lib/librte_ctrl_if \ lib/librte_distributor \ lib/librte_ether \ lib/librte_hash \ diff --git a/doc/guides/rel_notes/release_2_3.rst b/doc/guides/rel_notes/release_2_3.rst index 99de186..39725e4 100644 --- a/doc/guides/rel_notes/release_2_3.rst +++ b/doc/guides/rel_notes/release_2_3.rst @@ -4,6 +4,14 @@ DPDK Release 2.3 New Features +* **Control interface support added.** + + To enable controlling DPDK ports by common Linux tools. + Following modules added to DPDK: + + * librte_ctrl_if library + * librte_eal/linuxapp/kcp kernel module + Resolved Issues --- @@ -51,6 +59,7 @@ The libraries prepended with a plus sign were incremented in this version. librte_acl.so.2 librte_cfgfile.so.2 librte_cmdline.so.1 + + librte_ctrl_if.so.1 librte_distributor.so.1 librte_eal.so.2 librte_hash.so.2 diff --git a/lib/Makefile b/lib/Makefile index ef172ea..a50bc1e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,6 +1,6 @@ # BSD LICENSE # -# Copyright(c) 2010-2015 Intel Corporation. All rights reserved. +# Copyright(c) 2010-2016 Intel Corporation. 
All rights reserved. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -58,6 +58,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_PORT) += librte_port DIRS-$(CONFIG_RTE_LIBRTE_TABLE) += librte_table DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += librte_pipeline DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder +DIRS-$(CONFIG_RTE_LIBRTE_CTRL_IF) += librte_ctrl_if ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y) DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni diff --git
[dpdk-dev] [PATCH 1/3] kcp: add kernel control path kernel module
This kernel module is based on KNI module, but this one is stripped version of it and only for control messages, no data transfer functionality provided. This Linux kernel module helps userspace application create virtual interfaces and when a control command issued into that virtual interface, module pushes the command to the userspace and gets the response back for the caller application. The Linux tools like ethtool/ifconfig/ip can be used on virtual interfaces but not ones for related data, like tcpdump. In long term this patch intends to replace the KNI and KNI will be depreciated. Signed-off-by: Ferruh Yigit --- config/common_linuxapp | 6 + lib/librte_eal/linuxapp/Makefile | 5 +- lib/librte_eal/linuxapp/eal/Makefile | 3 +- .../linuxapp/eal/include/exec-env/rte_kcp_common.h | 86 +++ lib/librte_eal/linuxapp/kcp/Makefile | 58 + lib/librte_eal/linuxapp/kcp/kcp_dev.h | 65 + lib/librte_eal/linuxapp/kcp/kcp_ethtool.c | 261 +++ lib/librte_eal/linuxapp/kcp/kcp_misc.c | 282 + lib/librte_eal/linuxapp/kcp/kcp_net.c | 209 +++ lib/librte_eal/linuxapp/kcp/kcp_nl.c | 194 ++ 10 files changed, 1167 insertions(+), 2 deletions(-) create mode 100644 lib/librte_eal/linuxapp/eal/include/exec-env/rte_kcp_common.h create mode 100644 lib/librte_eal/linuxapp/kcp/Makefile create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_dev.h create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_ethtool.c create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_misc.c create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_net.c create mode 100644 lib/librte_eal/linuxapp/kcp/kcp_nl.c diff --git a/config/common_linuxapp b/config/common_linuxapp index 74bc515..5d5e3e4 100644 --- a/config/common_linuxapp +++ b/config/common_linuxapp @@ -503,6 +503,12 @@ CONFIG_RTE_KNI_VHOST_DEBUG_RX=n CONFIG_RTE_KNI_VHOST_DEBUG_TX=n # +# Compile librte_ctrl_if +# +CONFIG_RTE_KCP_KMOD=y +CONFIG_RTE_KCP_KO_DEBUG=n + +# # Compile vhost library # fuse-devel is needed to run vhost-cuse. 
# fuse-devel enables user space char driver development diff --git a/lib/librte_eal/linuxapp/Makefile b/lib/librte_eal/linuxapp/Makefile index d9c5233..d1fa3a3 100644 --- a/lib/librte_eal/linuxapp/Makefile +++ b/lib/librte_eal/linuxapp/Makefile @@ -1,6 +1,6 @@ # BSD LICENSE # -# Copyright(c) 2010-2014 Intel Corporation. All rights reserved. +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -38,6 +38,9 @@ DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal ifeq ($(CONFIG_RTE_KNI_KMOD),y) DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += kni endif +ifeq ($(CONFIG_RTE_KCP_KMOD),y) +DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += kcp +endif ifeq ($(CONFIG_RTE_LIBRTE_XEN_DOM0),y) DIRS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += xen_dom0 endif diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index 26eced5..dded8cb 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -1,6 +1,6 @@ # BSD LICENSE # -# Copyright(c) 2010-2015 Intel Corporation. All rights reserved. +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -116,6 +116,7 @@ CFLAGS_eal_thread.o += -Wno-return-type endif INC := rte_interrupts.h rte_kni_common.h rte_dom0_common.h +INC += rte_kcp_common.h SYMLINK-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP)-include/exec-env := \ $(addprefix include/exec-env/,$(INC)) diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kcp_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kcp_common.h new file mode 100644 index 000..b3a6ee3 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kcp_common.h @@ -0,0 +1,86 @@ +/*- + * This file is provided under a dual BSD/LGPLv2 license. When using or + * redistributing this file, you may do so under either license. 
+ * + * GNU LESSER GENERAL PUBLIC LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; + * + * Contact Information: + * Intel Corporation + * + * + *
[dpdk-dev] [PATCH 0/3] Use common Linux tools to control DPDK ports
This work is to make DPDK ports more visible and to enable using common Linux tools to configure DPDK ports. The patch is based on KNI but contains only its control functionality; also, this patch does not include any Linux kernel network driver. Basically, with the help of a kernel module (KCP), virtual Linux network interfaces named "dpdk$" are created per DPDK port; control messages sent to these virtual interfaces are forwarded to DPDK, and the response is sent back to the Linux application. Virtual interfaces are created when the DPDK application starts and are destroyed automatically when the DPDK application terminates. Communication between kernel space and DPDK is done using a netlink socket. Currently the implementation is not complete; sample support is added for the RFC, and more functionality can be added based on community response. With this RFC patch, the following are supported: getting/setting the MAC address and MTU of DPDK devices, getting stats from DPDK devices, and a subset of ethtool commands. In the long term this patch intends to replace KNI, and KNI will be deprecated.
Samples: $ ifconfig dpdk0: flags=4099mtu 1500 ether 90:e2:ba:0e:49:b8 txqueuelen 1000 (Ethernet) RX packets 33 bytes 2058 (2.0 KiB) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 33 bytes 2058 (2.0 KiB) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 dpdk1: flags=4099 mtu 1500 ether 00:1b:21:76:fa:21 txqueuelen 1000 (Ethernet) RX packets 0 bytes 0 (0.0 B) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 0 bytes 0 (0.0 B) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 After some traffic on port 0: $ ifconfig dpdk0: flags=4099 mtu 1500 ether 90:e2:ba:0e:49:77 txqueuelen 1000 (Ethernet) RX packets 962 bytes 57798 (56.4 KiB) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 962 bytes 57798 (56.4 KiB) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 $ ethtool -i dpdk0 driver: rte_ixgbe_pmd version: RTE 2.3.0-rc0 firmware-version: expansion-rom-version: bus-info: :08:00.0 supports-statistics: yes supports-test: no supports-eeprom-access: yes supports-register-dump: yes supports-priv-flags: no $ ip l show dpdk0 25: dpdk0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 90:e2:ba:0e:49:b8 brd ff:ff:ff:ff:ff:ff $ ip l set dpdk0 addr 90:e2:ba:0e:49:77 $ ip l show dpdk0 25: dpdk0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 90:e2:ba:0e:49:77 brd ff:ff:ff:ff:ff:ff Ferruh Yigit (3): kcp: add kernel control path kernel module rte_ctrl_if: add control interface library examples/ethtool: add control interface support to the application config/common_linuxapp | 9 +- doc/api/doxy-api-index.md | 3 +- doc/api/doxy-api.conf | 1 + doc/guides/rel_notes/release_2_3.rst | 9 + doc/guides/sample_app_ug/ethtool.rst | 41 +++ examples/ethtool/ethtool-app/main.c| 10 +- lib/Makefile | 3 +- lib/librte_ctrl_if/Makefile| 58 lib/librte_ctrl_if/rte_ctrl_if.c | 162 ++ lib/librte_ctrl_if/rte_ctrl_if.h | 115 +++ lib/librte_ctrl_if/rte_ctrl_if_version.map | 9 + lib/librte_ctrl_if/rte_ethtool.c | 354 + 
lib/librte_ctrl_if/rte_ethtool.h | 54 lib/librte_ctrl_if/rte_nl.c| 259 +++ lib/librte_ctrl_if/rte_nl.h| 60 lib/librte_eal/common/include/rte_log.h| 3 +- lib/librte_eal/linuxapp/Makefile | 5 +- lib/librte_eal/linuxapp/eal/Makefile | 3 +- .../linuxapp/eal/include/exec-env/rte_kcp_common.h | 86 + lib/librte_eal/linuxapp/kcp/Makefile | 58 lib/librte_eal/linuxapp/kcp/kcp_dev.h | 65 lib/librte_eal/linuxapp/kcp/kcp_ethtool.c | 261 +++ lib/librte_eal/linuxapp/kcp/kcp_misc.c | 282 lib/librte_eal/linuxapp/kcp/kcp_net.c | 209 lib/librte_eal/linuxapp/kcp/kcp_nl.c | 194 +++ mk/rte.app.mk | 3 +- 26 files changed, 2307 insertions(+), 9 deletions(-) create mode 100644 lib/librte_ctrl_if/Makefile create mode 100644 lib/librte_ctrl_if/rte_ctrl_if.c create mode 100644 lib/librte_ctrl_if/rte_ctrl_if.h create mode 100644 lib/librte_ctrl_if/rte_ctrl_if_version.map create mode 100644 lib/librte_ctrl_if/rte_ethtool.c create mode 100644 lib/librte_ctrl_if/rte_ethtool.h
[dpdk-dev] [PATCH v2 0/5] Optimize memcpy for AVX512 platforms
2016-01-17 22:05, Zhihong Wang: > This patch set optimizes DPDK memcpy for AVX512 platforms, to make full > utilization of hardware resources and deliver high performance. On a related note, your expertise would be very valuable in reviewing these patches, please: (memcpy) http://dpdk.org/dev/patchwork/patch/4396/ (memcmp) http://dpdk.org/dev/patchwork/patch/4788/ Thanks
[dpdk-dev] [PATCH v6 08/11] eal: pci: introduce RTE_KDRV_VFIO_NOIOMMU driver mode
On Tue, Jan 26, 2016 at 9:51 PM, Santosh Shukla wrote: > On Tue, Jan 26, 2016 at 7:58 PM, Thomas Monjalon > wrote: >> 2016-01-26 19:35, Santosh Shukla: >>> On Tue, Jan 26, 2016 at 6:30 PM, Thomas Monjalon >>> wrote: >>> > 2016-01-26 15:56, Santosh Shukla: >>> >> In my observation, currently virtio work for vfio-noiommu, that's why >>> >> said drv->kdrv need to know vfio mode. >>> > >>> > It is your observation. It may change in near future. >>> >>> so that mean till then, virtio support for non-x86 arch has to wait? >> >> No, absolutely not. virtio for non-x86 is welcome. >> >>> We have working model with vfio-noiommu, don't you think it make sense >>> to let vfio_noiommu implementation exist and later in-case >>> virtio+iommu gets mainline then switch to vfio __mode__ agnostic >>> approach. And for that All it takes to replace __noiommu suffix with >>> default. >> >> I'm just saying you should not touch the enum rte_kernel_driver. >> RTE_KDRV_VFIO is a driver. >> RTE_KDRV_VFIO_NOIOMMU is a mode. >> As the VFIO API is the same in both modes, there is no reason to >> distinguish them at this level. >> Your patch adds the NOIOMMU case everywhere: >> case RTE_KDRV_VFIO: >> + case RTE_KDRV_VFIO_NOIOMMU: >> >> I'll stop commenting here to let others give their opinion. >> >> [...] >>> >> with vfio+iommu; binding virtio pci device to vfio-pci driver fail; >>> >> giving below error: >>> >> [ 53.053464] VFIO - User Level meta-driver version: 0.3 >>> >> [ 73.077805] vfio-pci: probe of :00:03.0 failed with error -22 >>> >> [ 73.077852] vfio-pci: probe of :00:03.0 failed with error -22 >>> >> >>> >> vfio_pci_probe() --> vfio_iommu_group_get() --> iommu_group_get() >>> >> fails: iommu doesn't have group for virtio pci device. >>> > >>> > Yes it fails when binding. >>> > So the later check in the virtio PMD is useless. >>> >>> Which check? >> >> The check for VFIO noiommu only: >> - if (dev->kdrv == RTE_KDRV_VFIO) >> + if (dev->kdrv == RTE_KDRV_VFIO_NOIOMMU) >> >> [...] 
>>> > Furthermore restricting virtio to no-iommu mode doesn't bring >>> > any improvement. >>> >>> We're not __restricting__, as soon as virtio+iommu gets working state, >>> we'll simply replace __noiommu with default. Then its upto user to try >>> out virtio with vfio default or vfio_noiommu. >> >> Yes it's up to user. >> So your code should be >> if (dev->kdrv == RTE_KDRV_VFIO) >> > > Right, > >>> > That's why I suggest to keep the initial semantic of kdrv and >>> > not pollute it with VFIO modes. >>> >>> I am okay to live with default and forget suffix __noiommu but there >>> are implementation problem which was discussed in other thread >>> - Virtio pmd driver should avoid interface parsing i.e. >>> virtio_resource_init_uio/vfio() etc.. For vfio case - We could easily >>> get rid of by moving /sys parsing to pci_eal layer, Right? If so then >>> virtio currently works with vfio-noiommu, it make sense to me that >>> pci_eal layer does parsing for pmd driver before that pmd driver get >>> initialized. >> >> Please reword. What is the problem? >> >>> - Another case could be: iommu-less-pmd-driver. eal layer to do >>> parsing before updating drv->kdrv. >> >> [...] >>> >> >> > If a check is needed, I would prefer using your function >>> >> >> > pci_vfio_is_noiommu() and remove driver modes from struct >>> >> >> > rte_kernel_driver. >>> >> >> >>> >> >> I don't think calling pci_vfio_no_iommu() inside >>> >> >> virtio_reg_rd/wr_1/2/3() would be a good idea. >>> >> > >>> >> > Why? The value may be cached in the priv properties. >>> >> > >>> >> pci_vfio_is_noiommu() parses /sys for >>> >> - enable_noiommu param >>> >> - attached driver name is vfio-noiommu or not. >>> >> >>> >> It does file operation for that, I meant to say that calling this api >>> >> within register_rd/wr function is not correct. It would be better if >>> >> those low level register_rd/wr api only checks driver_types. 
>>> > >>> > Yes, that's why I said the return of pci_vfio_is_noiommu() may be cached >>> > to keep efficiency. >>> >>> I am not convinced though, Still find pmd driver checking driver_types >>> using drv->kdrv is better approach than introducing a new global >>> variable which may look something like; >> >> Not a global variable. A function in EAL layer. A variable in PMD priv. >> > > If we agreed to use condition (drv->kdrv == RTE_KDRV_VFIO); > then resource parsing for vfio {including vfio and vfio_noiommu both > case} is enforced in virtio pmd driver layer and that is contradicting > to what we agreed earlier in this[1] thread. Also we don't need a > function in EAL layer or a variable in PMD priv. Perhaps a private > function in virtio pmd which does parsing for vfio interface. > > Thoughts? > > [1] http://dpdk.org/dev/patchwork/patch/9862/ > Any comment/feedback on above approach? >>> At pci_eal layer >>> bool vfio_mode; >>> vfio_mode = pci_vfio_is_noiommu(); >>> >>> At virtio pmd driver layer >>> Checking
[dpdk-dev] [PATCH v2 0/2] minor cleanup in ethdev hotplug
2016-01-22 15:06, David Marchand: > It was first a preparation step for future patchsets, but I am not sure what > will become of them, so sending this anyway since it does not hurt to clean > this now. > > Changes since v1: > - rebased on HEAD (previous patchset was based on another patch I sent > separately) > - restored EINVAL error code for rte_eth_dev_(at|de)tach (thanks Jan) Applied, thanks
[dpdk-dev] [PATCH v5 10/11] virtio: pci: add dummy func definition for in/outb for non-x86 arch
Ping? On Tue, Jan 19, 2016 at 5:16 PM, Santosh Shukla wrote: > For non-x86 arch, Compiler will throw build error for in/out apis. Including > dummy api function so to pass build. > > Note that: For virtio to work for non-x86 arch - RTE_EAL_VFIO is the only > supported method. RTE_EAL_IGB_UIO is not supported for non-x86 arch. > > So, Virtio support for arch and supported interface by that arch: > > ARCH IGB_UIO VFIO > x86 Y Y > ARM64 N/A Y > PPC_64 N/A Y (Not tested but likely should work, as vfio is > arch independent) > > Note: Applicable for virtio spec 0.95 > > Signed-off-by: Santosh Shukla > --- > drivers/net/virtio/virtio_pci.h | 46 > +++ > 1 file changed, 46 insertions(+) > > diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h > index f550d22..b88f9ec 100644 > --- a/drivers/net/virtio/virtio_pci.h > +++ b/drivers/net/virtio/virtio_pci.h > @@ -46,6 +46,7 @@ > #endif > > #include > +#include "virtio_logs.h" > > struct virtqueue; > > @@ -320,6 +321,51 @@ outl_p(unsigned int data, unsigned int port) > } > #endif > > +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_I686) && \ > + defined(RTE_EXEC_ENV_LINUXAPP) > +static inline uint8_t > +inb(unsigned long addr __rte_unused) > +{ > + PMD_INIT_LOG(ERR, "inb() not supported for this RTE_ARCH\n"); > + return 0; > +} > + > +static inline uint16_t > +inw(unsigned long addr __rte_unused) > +{ > + PMD_INIT_LOG(ERR, "inw() not supported for this RTE_ARCH\n"); > + return 0; > +} > + > +static inline uint32_t > +inl(unsigned long addr __rte_unused) > +{ > + PMD_INIT_LOG(ERR, "in() not supported for this RTE_ARCH\n"); > + return 0; > +} > + > +static inline void > +outb_p(unsigned char data __rte_unused, unsigned int port __rte_unused) > +{ > + PMD_INIT_LOG(ERR, "outb_p() not supported for this RTE_ARCH\n"); > + return; > +} > + > +static inline void > +outw_p(unsigned short data __rte_unused, unsigned int port __rte_unused) > +{ > + PMD_INIT_LOG(ERR, "outw_p() not supported for this 
RTE_ARCH\n"); > + return; > +} > + > +static inline void > +outl_p(unsigned int data __rte_unused, unsigned int port __rte_unused) > +{ > + PMD_INIT_LOG(ERR, "outl_p() not supported for this RTE_ARCH\n"); > + return; > +} > +#endif > + > static inline int > vtpci_with_feature(struct virtio_hw *hw, uint64_t bit) > { > -- > 1.7.9.5 >
[dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode
Hi Thomas, > > +/* DMA mapping function prototype. > > + * Takes VFIO container fd as a parameter. > > + * Returns 0 on success, -1 on error. > > + * */ > > +typedef int (*vfio_dma_func_t)(int); > > + > > +struct vfio_iommu_type { > > + int type_id; > > + const char *name; > > + vfio_dma_func_t dma_map_func; > > +}; > > + > > +int vfio_iommu_type1_dma_map(int); > > +int vfio_iommu_noiommu_dma_map(int); > > Is it possible (is it better) to declare these functions with vfio_dma_func_t? Yeah, sure. Or maybe the other way around - maybe we could do away with the typedef. I'll go for the former though. > vfio_iommu_noiommu_dma_map is a weird name. > Why not vfio_noiommu_dma_map or vfio_iommu_none_dma_map? Well, the NOIOMMU type is named VFIO_IOMMU_NOIOMMU in the VFIO headers. So it's consistent with the IOMMU type name. Although vfio_noiommu_dma_map seems reasonable. Thanks, Anatoly
[dpdk-dev] [PATCH V1 1/1] jobstats: added function abort for job
> -Original Message- > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Panu Matilainen > Sent: Wednesday, January 27, 2016 2:38 PM > To: Kerlin, MarcinX ; dev at dpdk.org > Subject: Re: [dpdk-dev] [PATCH V1 1/1] jobstats: added function abort for job > > On 01/26/2016 06:15 PM, Marcin Kerlin wrote: > > This patch adds new function rte_jobstats_abort. It marks *job* as finished > > and time of this work will be add to management time instead of execution > time. > > This function should be used instead of rte_jobstats_finish if condition > occure, > > condition is defined by the application for example when receiving n>0 > packets. > > > > Signed-off-by: Marcin Kerlin > > --- > > lib/librte_jobstats/rte_jobstats.c | 22 ++ > > lib/librte_jobstats/rte_jobstats.h | 17 + > > lib/librte_jobstats/rte_jobstats_version.map | 7 +++ > > 3 files changed, 46 insertions(+) > > > [...] > > diff --git a/lib/librte_jobstats/rte_jobstats.h > b/lib/librte_jobstats/rte_jobstats.h > > index de6a89a..9995319 100644 > > --- a/lib/librte_jobstats/rte_jobstats.h > > +++ b/lib/librte_jobstats/rte_jobstats.h > > @@ -90,6 +90,9 @@ struct rte_jobstats { > > uint64_t exec_cnt; > > /**< Execute count. */ > > > > + uint64_t last_job_time; > > + /**< Last job time */ > > + > > char name[RTE_JOBSTATS_NAMESIZE]; > > /**< Name of this job */ > > > > AFAICS this is an ABI break and as such, needs to be preannounced, see > http://dpdk.org/doc/guides/contributing/versioning.html > For 2.3 it'd need to be a CONFIG_RTE_NEXT_ABI feature. > > - Panu - Hi Panu, Thanks for Your notice. This last_job_time field is actually not necessary here and will be removed from this structure. Best regards Michal
[dpdk-dev] [PATCH v6 1/2] mbuf: provide rte_pktmbuf_alloc_bulk API
On 01/26/2016 07:03 PM, Huawei Xie wrote: > v6 changes: > reflect the changes in release notes and library version map file > revise our duff's code style a bit to make it more readable > > v5 changes: > add comment about duff's device and our variant implementation > > v3 changes: > move while after case 0 > add context about duff's device and why we use while loop in the commit > message > > v2 changes: > unroll the loop a bit to help the performance > > rte_pktmbuf_alloc_bulk allocates a bulk of packet mbufs. > > There is related thread about this bulk API. > http://dpdk.org/dev/patchwork/patch/4718/ > Thanks to Konstantin's loop unrolling. > > Attached the wiki page about duff's device. It explains the performance > optimization through loop unwinding, and also the most dramatic use of > case label fall-through. > https://en.wikipedia.org/wiki/Duff%27s_device > > In our implementation, we use while() loop rather than do{} while() loop > because we could not assume count is strictly positive. Using while() > loop saves one line of check if count is zero. > > Signed-off-by: Gerald Rogers > Signed-off-by: Huawei Xie > Acked-by: Konstantin Ananyev > --- > doc/guides/rel_notes/release_2_3.rst | 3 ++ > lib/librte_mbuf/rte_mbuf.h | 55 > > lib/librte_mbuf/rte_mbuf_version.map | 7 + > 3 files changed, 65 insertions(+) > > diff --git a/doc/guides/rel_notes/release_2_3.rst > b/doc/guides/rel_notes/release_2_3.rst > index 99de186..a52cba3 100644 > --- a/doc/guides/rel_notes/release_2_3.rst > +++ b/doc/guides/rel_notes/release_2_3.rst > @@ -4,6 +4,9 @@ DPDK Release 2.3 > New Features > > > +* **Enable bulk allocation of mbufs. ** > + A new function ``rte_pktmbuf_alloc_bulk()`` has been added to allow the > user > + to allocate a bulk of mbufs. 
> > Resolved Issues > --- > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h > index f234ac9..b2ed479 100644 > --- a/lib/librte_mbuf/rte_mbuf.h > +++ b/lib/librte_mbuf/rte_mbuf.h > @@ -1336,6 +1336,61 @@ static inline struct rte_mbuf > *rte_pktmbuf_alloc(struct rte_mempool *mp) > } > > /** > + * Allocate a bulk of mbufs, initialize refcnt and reset the fields to > default > + * values. > + * > + * @param pool > + *The mempool from which mbufs are allocated. > + * @param mbufs > + *Array of pointers to mbufs > + * @param count > + *Array size > + * @return > + * - 0: Success > + */ > +static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool, > + struct rte_mbuf **mbufs, unsigned count) > +{ > + unsigned idx = 0; > + int rc; > + > + rc = rte_mempool_get_bulk(pool, (void **)mbufs, count); > + if (unlikely(rc)) > + return rc; > + > + /* To understand duff's device on loop unwinding optimization, see > + * https://en.wikipedia.org/wiki/Duff's_device. > + * Here while() loop is used rather than do() while{} to avoid extra > + * check if count is zero. > + */ > + switch (count % 4) { > + case 0: > + while (idx != count) { > + RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); > + rte_mbuf_refcnt_set(mbufs[idx], 1); > + rte_pktmbuf_reset(mbufs[idx]); > + idx++; > + case 3: > + RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); > + rte_mbuf_refcnt_set(mbufs[idx], 1); > + rte_pktmbuf_reset(mbufs[idx]); > + idx++; > + case 2: > + RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); > + rte_mbuf_refcnt_set(mbufs[idx], 1); > + rte_pktmbuf_reset(mbufs[idx]); > + idx++; > + case 1: > + RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(mbufs[idx]) == 0); > + rte_mbuf_refcnt_set(mbufs[idx], 1); > + rte_pktmbuf_reset(mbufs[idx]); > + idx++; > + } > + } > + return 0; > +} > + > +/** >* Attach packet mbuf to another packet mbuf. 
>* >* After attachment we refer the mbuf we attached as 'indirect', > diff --git a/lib/librte_mbuf/rte_mbuf_version.map > b/lib/librte_mbuf/rte_mbuf_version.map > index e10f6bd..257c65a 100644 > --- a/lib/librte_mbuf/rte_mbuf_version.map > +++ b/lib/librte_mbuf/rte_mbuf_version.map > @@ -18,3 +18,10 @@ DPDK_2.1 { > rte_pktmbuf_pool_create; > > } DPDK_2.0; > + > +DPDK_2.3 { > + global: > + > + rte_pktmbuf_alloc_bulk; > + > +} DPDK_2.1; > Since rte_pktmbuf_alloc_bulk() is an inline function, it is not part of the library ABI and should not be listed in the version map. I assume it's inline for performance reasons, but then you lose the benefits of dynamic linking such as the ability to fix bugs and/or improve it by just updating the library. Since the point of
[dpdk-dev] Errors Rx count increasing while pktgen doing nothing on Intel 82598EB 10G
Laurent, have you resolved this problem? I'm using the same NIC as yours (i.e. Intel 82598EB 10G NIC) and faced the same problem as you. Here is parts of my log and it says that PMD cannot enable RX queue for my NIC. I'm using DPDK 2.2.0 and used 'null' for the 4th parameter in calling rte_eth_rx_queue_setup(). (i.e. 'null' parameter provides the default rx_conf value.) Thanks. APP: initialising port 0 ... PMD: ixgbe_dev_rx_queue_setup(): sw_ring=0x7f5f27258040 sw_sc_ring=0x7f5f27257b00 hw_ring=0x7f5f27258580 dma_addr=0x41f458580 PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f5f27245940 hw_ring=0x7f5f27247980 dma_addr=0x41f447980 PMD: ixgbe_set_tx_function(): Using simple tx code path PMD: ixgbe_set_tx_function(): Vector tx enabled. PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f5f272337c0 hw_ring=0x7f5f27235800 dma_addr=0x41f435800 PMD: ixgbe_set_tx_function(): Using simple tx code path PMD: ixgbe_set_tx_function(): Vector tx enabled. PMD: ixgbe_set_rx_function(): Vector rx enabled, please make sure RX burst size no less than 4 (port=0). *PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 0* APP: port 0 has started APP: port 0 has entered in promiscuous mode APP: port 0 initialization is done. KNI: pci: 09:00:00 8086:10c7 APP: kni allocation is done for port 0. APP: initialising port 1 ... PMD: ixgbe_dev_rx_queue_setup(): sw_ring=0x7f5f27222dc0 sw_sc_ring=0x7f5f27222880 hw_ring=0x7f5f27223300 dma_addr=0x41f423300 PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f5f272106c0 hw_ring=0x7f5f27212700 dma_addr=0x41f412700 PMD: ixgbe_set_tx_function(): Using simple tx code path PMD: ixgbe_set_tx_function(): Vector tx enabled. PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f5f271fe540 hw_ring=0x7f5f27200580 dma_addr=0x41f400580 PMD: ixgbe_set_tx_function(): Using simple tx code path PMD: ixgbe_set_tx_function(): Vector tx enabled. PMD: ixgbe_set_rx_function(): Vector rx enabled, please make sure RX burst size no less than 4 (port=1). 
*PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 0* APP: port 1 has started APP: port 1 has entered in promiscuous mode APP: port 1 initialization is done. KNI: pci: 0a:00:00 8086:10c7 APP: kni allocation is done for port 1. checking link status .done Port 0 Link Up - speed 1 Mbps - full-duplex Port 1 Link Up - speed 1 Mbps - full-duplex On Mon, Dec 28, 2015 at 5:28 AM, Wiles, Keith wrote: > On 12/27/15, 2:09 PM, "Laurent GUERBY" wrote: > > >On Sun, 2015-12-27 at 19:43 +, Wiles, Keith wrote: > >> On 12/27/15, 12:31 PM, "dev on behalf of Laurent GUERBY" < > dev-bounces at dpdk.org on behalf of laurent at guerby.net> wrote: > >> > >> >Hi, > >> > > >> >I reported today an issue when using Pktgen-DPDK: > >> >https://github.com/pktgen/Pktgen-DPDK/issues/52 > >> > > >> >But I think it's more in DPDK than pktgen > >> > > >> >two identical machines with SFP+ DA cable between them > >> >DPDK 2.2.0 from tarball > >> >Pktgen-DPDK from git > >> >two identical machines: > >> >core i7 2600 (sandy bridge 4C/8T), HT disabled in the BIOS > >> >ASUS P8H67-M PRO BIOS 3904 (latest available) > >> >Ethernet controller: Intel Corporation 82598EB 10-Gigabit AF Dual Port > >> >Network Connection (rev 01) > >> >01:00.0 0200: 8086:10f1 (rev 01) > >> >Subsystem: 8086:a21f > >> >boot kernel 3.16 unbutu 14.04 with isolcpus=2,3,4 > >> > > >> >When launching pktgen even with no TX asked the Errors RX counters > keeps > >> >going up by about 7.4 millions per second: > >> > > >> >Errors Rx/Tx : 7471857054/0 > >> > > >> >In the log I get "Could not enable Rx Queue", might be the > >> >source of the issue? > >> > > >> >PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 0 > >> >PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 1 > >> > > >> >When sending traffic single UDP src/dst/IP/MAC the setup > >> >reaches 14204188 pps 64 bytes, the error counter is also > >> >increasing. > >> > > >> >Any idea what to look for? 
> >> > >> One more suggestion is to run test_pmd on one machine and something > >> like iperf on the other to verify the DPDK is working correct, which I > >> assume will be true. Not sure the RX errors are reported in the > >> test_pmd or you could use the l3fwd application too. > > > >Ok, I will check the test_pmd documentation and try to do this test: I'm > >just starting on DPDK :). > > > >> Please also send me the 'lspci | grep Ethernet? output. > > > >I included one line in my original email above (plus extract of lspci > >-vn), here is the full output of the command: > > > >01:00.0 Ethernet controller: Intel Corporation 82598EB 10-Gigabit AF > >Dual Port Network Connection (rev 01) > >01:00.1 Ethernet controller: Intel Corporation 82598EB 10-Gigabit AF > >Dual Port Network Connection (rev 01) > >05:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. > >RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller (rev 06) > > > >(The realtek is used only for internet connectivity). > > > >> Also send me the command line. > > > >On the first
[dpdk-dev] [PATCH V1 1/1] jobstats: added function abort for job
On 01/26/2016 06:15 PM, Marcin Kerlin wrote: > This patch adds new function rte_jobstats_abort. It marks *job* as finished > and time of this work will be added to management time instead of execution > time. > This function should be used instead of rte_jobstats_finish if a condition > occurs; > the condition is defined by the application, for example when receiving n>0 > packets. > > Signed-off-by: Marcin Kerlin > --- > lib/librte_jobstats/rte_jobstats.c | 22 ++ > lib/librte_jobstats/rte_jobstats.h | 17 + > lib/librte_jobstats/rte_jobstats_version.map | 7 +++ > 3 files changed, 46 insertions(+) > [...] > diff --git a/lib/librte_jobstats/rte_jobstats.h > b/lib/librte_jobstats/rte_jobstats.h > index de6a89a..9995319 100644 > --- a/lib/librte_jobstats/rte_jobstats.h > +++ b/lib/librte_jobstats/rte_jobstats.h > @@ -90,6 +90,9 @@ struct rte_jobstats { > uint64_t exec_cnt; > /**< Execute count. */ > > + uint64_t last_job_time; > + /**< Last job time */ > + > char name[RTE_JOBSTATS_NAMESIZE]; > /**< Name of this job */ > AFAICS this is an ABI break and as such, needs to be preannounced, see http://dpdk.org/doc/guides/contributing/versioning.html For 2.3 it'd need to be a CONFIG_RTE_NEXT_ABI feature. - Panu -
[dpdk-dev] [PATCH] no need to test for NULL when freeing
2016-01-21 12:23, David Marchand: > free() already handles NULL pointer. > > Signed-off-by: David Marchand Applied, thanks
[dpdk-dev] [PATCH] rte.extvars.mk: allow overriding RTE_SDK_BIN from the environment
2016-01-20 21:15, Matthew Hall: > On 1/20/16 7:27 AM, Thomas Monjalon wrote: > > Hi Matthew, > > > > RTE_SDK_BIN is an internal variable and should not be overriden. > > > > Have you installed DPDK somewhere? Example: > > make install O=mybuild DESTDIR=mylocalinstall > > > > Then you should build your app like this: > > make RTE_SDK=$(readlink -e ../dpdk/mylocalinstall/usr/local/share/dpdk) > > Hello Thomas, > > Is the way the make install target really works documented somewhere? It is poorly described here: http://dpdk.org/doc/guides/prog_guide/dev_kit_root_make_help.html#install-targets > This target did not exist when I first used DPDK in 2011, and since then > I saw various documentation on building DPDK in various places, but not > that much explanation what make install actually does. I recall various > list threads about changing its behavior as well. Historically, "make install" was a convenient default build (with T= option). The DESTDIR option was added to make a real install after building. The standard form (without T=) is now implemented to do a real install. > For example, if I look at this apparently most official document: > > http://dpdk.org/doc/guides/linux_gsg/build_dpdk.html > > It has build examples such as: > > make install T=x86_64-native-linuxapp-gcc This command finishes with this message: Installation cannot run with T defined and DESTDIR undefined Yes you are right, some docs are neither complete nor up-to-date. Volunteers are welcome. > But it does not discuss "O=" or "DESTDIR=" or any other additional > options. From some experiments on my machine, it looks like maybe I > could do this: > > make install "T=${RTE_TARGET}" "O=build" "DESTDIR=build" > > Is that a valid possibility, to keep it all in one easy directory? Yes you can install where you want. Note that this command (with T= and O=) will build in the directory $O/$T i.e. build/${RTE_TARGET} and install in build/ Please confirm that this patch is not needed. Thanks
[dpdk-dev] [PATCH 4/5] vhost: do not use rte_memcpy for virtio_hdr copy
On Wed, Jan 27, 2016 at 06:16:37AM +, Xie, Huawei wrote: > On 1/27/2016 2:02 PM, Yuanhan Liu wrote: > > On Wed, Jan 27, 2016 at 05:56:56AM +, Xie, Huawei wrote: > >> On 1/27/2016 11:22 AM, Yuanhan Liu wrote: > >>> On Wed, Jan 27, 2016 at 02:46:39AM +, Xie, Huawei wrote: > On 12/3/2015 2:03 PM, Yuanhan Liu wrote: > > + if (vq->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) { > > + *(struct virtio_net_hdr_mrg_rxbuf > > *)(uintptr_t)desc_addr = hdr; > > + } else { > > + *(struct virtio_net_hdr *)(uintptr_t)desc_addr = > > hdr.hdr; > > + } > Thanks! > We might simplify this further. Just reset the first two fields flags > and gso_type. > >>> What's this "simplification" for? Don't even to say that we will add > >>> TSO support, which modifies few more files, such as csum_start: resetting > >>> the first two fields only is wrong here. > >> I know TSO before commenting, but at least in this implementation and > >> this specific patch, i guess zeroing two fields are enough. > >> > >> What is wrong resetting only two fields? > > I then have to ask "What is the benefit of resetting only two fields"? > > If doing so, we have to change it back for TSO. My proposal requires no > > extra change when adding TSO support. > > ? Benefit is we save four unnecessary stores. Hmm..., the hdr size is 12 bytes at most. I mean, does it really matter, copying 3 bytes, or copying 12 bytes in a row? --yliu
[dpdk-dev] [PATCH v4] vfio: Support for no-IOMMU mode
This commit is adding a generic mechanism to support multiple IOMMU types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special VFIO mode that doesn't use IOMMU at all), but it's easily extended by adding necessary definitions into eal_pci_init.h and a DMA mapping function to eal_pci_vfio.c. Since type 1 IOMMU module is no longer necessary to have VFIO, we fix the module check to check for vfio-pci instead. It's not ideal and triggers VFIO checks more often (and thus produces more error output, which was the reason behind the module check in the first place), so we compensate for that by providing more verbose logging, indicating whether VFIO initialization has succeeded or failed. Signed-off-by: Anatoly Burakov Signed-off-by: Santosh Shukla Tested-by: Santosh Shukla --- v4 changes: Fixed the commit message and added a missing sign-off v3 changes: Merging DMA mapping functions back into eal_pci_vfio.c Fixing and adding comments v2 changes: Compile fix (hat-tip to Santosh Shukla) Tested-by is provisional, since only superficial testing was done lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 + lib/librte_eal/linuxapp/eal/eal_vfio.h | 5 + 2 files changed, 157 insertions(+), 53 deletions(-) diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c index 74f91ba..fdf334b 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq) #define VFIO_DIR "/dev/vfio" #define VFIO_CONTAINER_PATH "/dev/vfio/vfio" #define VFIO_GROUP_FMT "/dev/vfio/%u" +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) /* per-process VFIO config */ static struct vfio_config vfio_cfg; +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. 
+ * */ +typedef int (*vfio_dma_func_t)(int); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_func_t dma_map_func; +}; + +int vfio_iommu_type1_dma_map(int); +int vfio_iommu_noiommu_dma_map(int); + +/* IOMMU types we support */ +static const struct vfio_iommu_type iommu_types[] = { + /* x86 IOMMU, otherwise known as type 1 */ + { VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map}, + /* IOMMU-less mode */ + { VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map}, +}; + +int +vfio_iommu_type1_dma_map(int vfio_container_fd) +{ + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + int i, ret; + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + struct vfio_iommu_type1_dma_map dma_map; + + if (ms[i].addr == NULL) + break; + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = ms[i].addr_64; + dma_map.size = ms[i].len; + dma_map.iova = ms[i].phys_addr; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +int +vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + int pci_vfio_read_config(const struct rte_intr_handle *intr_handle, void *buf, size_t len, off_t offs) @@ -208,42 +271,58 @@ pci_vfio_set_bus_master(int dev_fd) return 0; } -/* set up DMA mappings */ -static int -pci_vfio_setup_dma_maps(int vfio_container_fd) -{ - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - int i, ret; - - ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, - VFIO_TYPE1_IOMMU); - if (ret) { - RTE_LOG(ERR, EAL, " cannot set IOMMU type, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; +/* pick IOMMU type. 
returns a pointer to vfio_iommu_type or NULL for error */ +static const struct vfio_iommu_type * +pci_vfio_set_iommu_type(int vfio_container_fd) { + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", +
[dpdk-dev] [PATCH v3] vfio: Support for no-IOMMU mode
Apologies, lost the signoff from Santosh Shukla and also the commit message still mentions the file that is now non-existent, so I'll submit a v4. Thanks, Anatoly > -Original Message- > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Anatoly Burakov > Sent: Wednesday, January 27, 2016 2:05 PM > To: dev at dpdk.org > Subject: [dpdk-dev] [PATCH v3] vfio: Support for no-IOMMU mode > > This commit is adding a generic mechanism to support multiple IOMMU > types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special VFIO > mode that doesn't use IOMMU at all), but it's easily extended by adding > necessary definitions into eal_pci_init.h and a DMA mapping function to > eal_pci_vfio_dma.c. > > Since type 1 IOMMU module is no longer necessary to have VFIO, we fix the > module check to check for vfio-pci instead. It's not ideal and triggers VFIO > checks more often (and thus produces more error output, which was the > reason behind the module check in the first place), so we compensate for > that by providing more verbose logging, indicating whether VFIO initialization > has succeeded or failed. 
> > Signed-off-by: Anatoly Burakov > Tested-by: Santosh Shukla > --- > v3 changes: > Merging DMA mapping functions back into eal_pci_vfio.c > Fixing and adding comments > > v2 changes: > Compile fix (hat-tip to Santosh Shukla) > Tested-by is provisional, since only superficial testing was done > > lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 +-- > -- > lib/librte_eal/linuxapp/eal/eal_vfio.h | 5 + > 2 files changed, 157 insertions(+), 53 deletions(-) > > diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c > b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c > index 74f91ba..fdf334b 100644 > --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c > +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c > @@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq) > #define VFIO_DIR "/dev/vfio" > #define VFIO_CONTAINER_PATH "/dev/vfio/vfio" > #define VFIO_GROUP_FMT "/dev/vfio/%u" > +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" > #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) > > /* per-process VFIO config */ > static struct vfio_config vfio_cfg; > > +/* DMA mapping function prototype. > + * Takes VFIO container fd as a parameter. > + * Returns 0 on success, -1 on error. > + * */ > +typedef int (*vfio_dma_func_t)(int); > + > +struct vfio_iommu_type { > + int type_id; > + const char *name; > + vfio_dma_func_t dma_map_func; > +}; > + > +int vfio_iommu_type1_dma_map(int); > +int vfio_iommu_noiommu_dma_map(int); > + > +/* IOMMU types we support */ > +static const struct vfio_iommu_type iommu_types[] = { > + /* x86 IOMMU, otherwise known as type 1 */ > + { VFIO_TYPE1_IOMMU, "Type 1", > _iommu_type1_dma_map}, > + /* IOMMU-less mode */ > + { VFIO_NOIOMMU_IOMMU, "No-IOMMU", > _iommu_noiommu_dma_map}, }; > + > +int > +vfio_iommu_type1_dma_map(int vfio_container_fd) { > + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); > + int i, ret; > + > + /* map all DPDK segments for DMA. 
use 1:1 PA to IOVA mapping */ > + for (i = 0; i < RTE_MAX_MEMSEG; i++) { > + struct vfio_iommu_type1_dma_map dma_map; > + > + if (ms[i].addr == NULL) > + break; > + > + memset(_map, 0, sizeof(dma_map)); > + dma_map.argsz = sizeof(struct > vfio_iommu_type1_dma_map); > + dma_map.vaddr = ms[i].addr_64; > + dma_map.size = ms[i].len; > + dma_map.iova = ms[i].phys_addr; > + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | > VFIO_DMA_MAP_FLAG_WRITE; > + > + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, > _map); > + > + if (ret) { > + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, > " > + "error %i (%s)\n", errno, > strerror(errno)); > + return -1; > + } > + } > + > + return 0; > +} > + > +int > +vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd) { > + /* No-IOMMU mode does not need DMA mapping */ > + return 0; > +} > + > int > pci_vfio_read_config(const struct rte_intr_handle *intr_handle, > void *buf, size_t len, off_t offs) @@ -208,42 +271,58 @@ > pci_vfio_set_bus_master(int dev_fd) > return 0; > } > > -/* set up DMA mappings */ > -static int > -pci_vfio_setup_dma_maps(int vfio_container_fd) -{ > - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); > - int i, ret; > - > - ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, > - VFIO_TYPE1_IOMMU); > - if (ret) { > - RTE_LOG(ERR, EAL, " cannot set IOMMU type, " > - "error %i (%s)\n", errno, strerror(errno)); > - return -1; > +/* pick IOMMU type. returns a pointer to vfio_iommu_type
[dpdk-dev] [PATCH 1/5] vhost: refactor rte_vhost_dequeue_burst
On Wed, Jan 27, 2016 at 06:12:22AM +, Xie, Huawei wrote: > On 1/27/2016 11:26 AM, Yuanhan Liu wrote: > > On Tue, Jan 26, 2016 at 10:30:12AM +, Xie, Huawei wrote: > >> On 12/3/2015 2:03 PM, Yuanhan Liu wrote: > >>> Signed-off-by: Yuanhan Liu > >>> --- > >>> lib/librte_vhost/vhost_rxtx.c | 287 > >>> +- > >>> 1 file changed, 113 insertions(+), 174 deletions(-) > >> Prefer to unroll copy_mbuf_to_desc and your COPY macro. It prevents us > > I'm okay to unroll COPY macro. But for copy_mbuf_to_desc, I prefer not > > to do that, unless it has a good reason. > > > >> processing descriptors in a burst way in future. > > So, do you have a plan? > > I think it is OK. If we need unroll in future, we could do that then. I > am open to this. Just my preference. I understand that wrapping makes > code more readable. Okay, let's consider it then: unroll would be easy after all. --yliu
[dpdk-dev] [PATCH 0/9] pci cleanup and blacklist rework
On Fri, Jan 22, 2016 at 4:27 PM, David Marchand wrote: > The 4th patch introduces a change in linux eal. > Before, if a pci device was bound to no kernel driver, eal would set kdrv > to "unknown". With this change, kdrv is set to "none". > This might make it possible to avoid the old issue of virtio devices being > used by dpdk while still bound to kernel driver reported by Franck B.. > I'll let virtio guys look at this. > At the very least, it makes more sense to me. Ok, actually, I had forgotten that Huawei had already sent a similar change [1]. So I suppose this patch commitlog is wrong, but the patch itself is still worth for the cleanup. Thomas, I suppose you will integrate Huawei patches first. Then I will rebase and fix the commitlog. [1] http://dpdk.org/dev/patchwork/patch/9718/ -- David Marchand
[dpdk-dev] [PATCH v3] vfio: Support for no-IOMMU mode
This commit is adding a generic mechanism to support multiple IOMMU types. For now, it's only type 1 (x86 IOMMU) and no-IOMMU (a special VFIO mode that doesn't use IOMMU at all), but it's easily extended by adding necessary definitions into eal_pci_init.h and a DMA mapping function to eal_pci_vfio_dma.c. Since type 1 IOMMU module is no longer necessary to have VFIO, we fix the module check to check for vfio-pci instead. It's not ideal and triggers VFIO checks more often (and thus produces more error output, which was the reason behind the module check in the first place), so we compensate for that by providing more verbose logging, indicating whether VFIO initialization has succeeded or failed. Signed-off-by: Anatoly Burakov Tested-by: Santosh Shukla --- v3 changes: Merging DMA mapping functions back into eal_pci_vfio.c Fixing and adding comments v2 changes: Compile fix (hat-tip to Santosh Shukla) Tested-by is provisional, since only superficial testing was done lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 205 + lib/librte_eal/linuxapp/eal/eal_vfio.h | 5 + 2 files changed, 157 insertions(+), 53 deletions(-) diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c index 74f91ba..fdf334b 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -72,11 +72,74 @@ EAL_REGISTER_TAILQ(rte_vfio_tailq) #define VFIO_DIR "/dev/vfio" #define VFIO_CONTAINER_PATH "/dev/vfio/vfio" #define VFIO_GROUP_FMT "/dev/vfio/%u" +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) /* per-process VFIO config */ static struct vfio_config vfio_cfg; +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. 
+ * */ +typedef int (*vfio_dma_func_t)(int); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_func_t dma_map_func; +}; + +int vfio_iommu_type1_dma_map(int); +int vfio_iommu_noiommu_dma_map(int); + +/* IOMMU types we support */ +static const struct vfio_iommu_type iommu_types[] = { + /* x86 IOMMU, otherwise known as type 1 */ + { VFIO_TYPE1_IOMMU, "Type 1", &vfio_iommu_type1_dma_map}, + /* IOMMU-less mode */ + { VFIO_NOIOMMU_IOMMU, "No-IOMMU", &vfio_iommu_noiommu_dma_map}, +}; + +int +vfio_iommu_type1_dma_map(int vfio_container_fd) +{ + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + int i, ret; + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + struct vfio_iommu_type1_dma_map dma_map; + + if (ms[i].addr == NULL) + break; + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = ms[i].addr_64; + dma_map.size = ms[i].len; + dma_map.iova = ms[i].phys_addr; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +int +vfio_iommu_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + int pci_vfio_read_config(const struct rte_intr_handle *intr_handle, void *buf, size_t len, off_t offs) @@ -208,42 +271,58 @@ pci_vfio_set_bus_master(int dev_fd) return 0; } -/* set up DMA mappings */ -static int -pci_vfio_setup_dma_maps(int vfio_container_fd) -{ - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - int i, ret; - - ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, - VFIO_TYPE1_IOMMU); - if (ret) { - RTE_LOG(ERR, EAL, " cannot set IOMMU type, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; +/* pick IOMMU type. 
returns a pointer to vfio_iommu_type or NULL for error */ +static const struct vfio_iommu_type * +pci_vfio_set_iommu_type(int vfio_container_fd) { + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + return t; +
[dpdk-dev] [PATCH 4/5] vhost: do not use rte_memcpy for virtio_hdr copy
On Wed, Jan 27, 2016 at 05:56:56AM +, Xie, Huawei wrote: > On 1/27/2016 11:22 AM, Yuanhan Liu wrote: > > On Wed, Jan 27, 2016 at 02:46:39AM +, Xie, Huawei wrote: > >> On 12/3/2015 2:03 PM, Yuanhan Liu wrote: > >>> + if (vq->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) { > >>> + *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr; > >>> + } else { > >>> + *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr; > >>> + } > >> Thanks! > >> We might simplify this further. Just reset the first two fields flags > >> and gso_type. > > What's this "simplification" for? Don't even to say that we will add > > TSO support, which modifies few more files, such as csum_start: resetting > > the first two fields only is wrong here. > > I know TSO before commenting, but at least in this implementation and > this specific patch, i guess zeroing two fields are enough. > > What is wrong resetting only two fields? I then have to ask "What is the benefit of resetting only two fields"? If doing so, we have to change it back for TSO. My proposal requires no extra change when adding TSO support. --yliu
[dpdk-dev] [PATCH v2 4/4] virtio: check if any kernel driver is manipulating the virtio device
2016-01-07 16:17, Panu Matilainen: > On 01/03/2016 07:56 PM, Huawei Xie wrote: > > v2 changes: > > change LOG level from ERR to INFO > > > > virtio PMD could use IO port to configure the virtio device without > > using uio driver. > > > > There are two issues with previous implementation: > > 1) virtio PMD will take over each virtio device blindly even if some > > are not intended for DPDK. > > 2) driver conflict between virtio PMD and virtio-net kernel driver. > > > > This patch checks if there is any kernel driver manipulating the virtio > > device before virtio PMD uses IO port to configure the device. > > > > Fixes: da978dfdc43b ("virtio: use port IO to get PCI resource") > > > > Signed-off-by: Huawei Xie > > --- > > drivers/net/virtio/virtio_ethdev.c | 7 +++ > > 1 file changed, 7 insertions(+) > > > > diff --git a/drivers/net/virtio/virtio_ethdev.c > > b/drivers/net/virtio/virtio_ethdev.c > > index e815acd..7a50dac 100644 > > --- a/drivers/net/virtio/virtio_ethdev.c > > +++ b/drivers/net/virtio/virtio_ethdev.c > > @@ -1138,6 +1138,13 @@ static int virtio_resource_init_by_ioports(struct > > rte_pci_device *pci_dev) > > int found = 0; > > size_t linesz; > > > > + if (pci_dev->kdrv != RTE_KDRV_NONE) { > > + PMD_INIT_LOG(INFO, > > + "kernel driver is manipulating this device." \ > > + " Please unbind the kernel driver."); > > At the very least this message needs to be changed. > > Like said earlier, I think the message could just as well be dropped > entirely, but at least it should be something to the tune of "ignoring > kernel owned device" instead of asking the user to break their > configuration. Huawei, a v3 is required. Thanks
[dpdk-dev] [PATCH] vfio/noiommu: Don't use iommu_present() to track fake groups
Hi Alex, > On 01/23/2016 04:23 AM, Alex Williamson wrote: > > Using iommu_present() to determine whether an IOMMU group is real or > > fake has some problems. First, apparently Power systems don't > > register an IOMMU on the device bus, so the groups and containers get > > marked as noiommu and then won't bind to their actual IOMMU driver. > > Second, I expect we'll run into the same issue as we try to support > > vGPUs through vfio, since they're likely to emulate this behavior of > > creating an IOMMU group on a virtual device and then providing a vfio > > IOMMU backend tailored to the sort of isolation they provide, which > > won't necessarily be fully compatible with the IOMMU API. > > > > The solution here is to use the existing iommudata interface to IOMMU > > groups, which allows us to easily identify the fake groups we've > > created for noiommu purposes. The iommudata we set is purely > > arbitrary since we're only comparing the address, so we use the > > address of the noiommu switch itself. > > > > Reported-by: Alexey Kardashevskiy > > Fixes: 03a76b60f8ba ("vfio: Include No-IOMMU mode") > > Signed-off-by: Alex Williamson > > > > Reviewed-by: Alexey Kardashevskiy > Tested-by: Alexey Kardashevskiy Tested bringing the NICs up, encountered no issues. Curious if it also works for Santosh (CC'd) as he's one of the intended users of the No-IOMMU functionality, but otherwise seems to work. Thanks, Anatoly
[dpdk-dev] [RFC] eal: add cgroup-aware resource self discovery
On Wed, Jan 27, 2016 at 08:02:27PM +0800, Tan, Jianfeng wrote: > Hi Neil, > > On 1/26/2016 10:19 PM, Neil Horman wrote: > >On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote: > >>Hi Neil, > >> > >>On 1/25/2016 9:46 PM, Neil Horman wrote: > >>>On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote: > >>... > -- > 2.1.4 > > > >>>This doesn't make a whole lot of sense, for several reasons: > >>> > >>>1) Applications, as a general rule shouldn't be interrogating the cgroups > >>>interface at all. > >>The main reason to do this in DPDK is that DPDK obtains resource information > >>from sysfs and proc, which are not well containerized so far. And DPDK > >>pre-allocates resource instead of on-demand gradual allocating. > >> > >Not disagreeing with this, just suggesting that: > > > >1) Interrogating cgroups really isn't the best way to collect that > >information > >2) Pre-allocating those resources isn't particularly wise without some > >mechanism > >to reallocate it, as resource constraints can change (consider your cpuset > >getting rewritten) > > In the case of reallocate, > For cpuset, DPDK panics in the initialization if set_affinity fails, but > after that, cpuset rewritten will not bring any problem I believe. Yes, that seems reasonable, but I think you need to update rte_thread_set_affinity to not assume that success in pthread_setaffinity_np means that all cpus in the provided mask are available. That is to say, cpusetp is subsequently stored in lcore information after the set, but may not reflect the actual working set of processors; you should follow a successful set with a call to pthread_getaffinity_np to retrieve the actual working cpuset. As for subsequent changes to the cpuset, I'm not sure how you want to handle that. I would think that you might want to run a check periodically or allow for a SIGHUP or some other signal to trigger a rescan of your working cpuset so as to keep the application in sync with the system. 
> For memory, a running application uses 2G hugepages, then admin decreases > hugetlb cgroup into 1G, the application will not get killed, unless it tries > to access more hugepages (I'll double check this). > No, the semantics should be identical to malloc/mmap (if you use the alloc_hugepages api or the mmap api). You should get a NULL return or other non-fatal indicator if you allocate more than is available. > So another way to address this problem is to add an option that DPDK tries > best to allocate those resources, and if fails, it just posts a warning and > uses those allocated resources, instead of panic. What do you think? > Yes, that makes sense > > > >>>2) Cgroups aren't the only way in which a cpuset or memoryset can be > >>>restricted > >>>(the isolcpus command line argument, or a taskset on a parent process for > >>>instance, but there are several others). > >>Yes, I agree. To enable that, I'd like design the new API for resource self > >>discovery in a flexible way. A parameter "type" is used to specify the > >>solution to discovery way. In addition, I'm considering to add a callback > >>function pointer so that users can write their own resource discovery > >>functions. > >> > >Why? You don't need an API for this, or if you really want one, it can be > >very > >generic if you use POSIX apis to gather the information. What you have here > >is > >going to be very linux specific, and will need reimplementing for BSD or > >other > >operating systems. To use the cpuset example, instead of reading and parsing > >the mask files in the cgroup filesystem module to find your task and > >corresponding mask, just call sched_setaffinity with an all f's mask, then > >call > >sched_getaffinity. The returned mask will be all the cpus your process is > >allowed to execute on, taking into account every limiting filter the system > >you > >are running on offers. > > Yes, it makes sense on cpu's side. 
> > > > >There are similar OS level POSIX apis for most resources out there. You > >really > >don't need to dig through cgroups just to learn what some of those resources > >are. > > > >>>Instead of trying to figure out what cpuset is valid for your process by > >>>interrogating the cgroups hierarchy, instead you should follow the > >>>prescribed > >>>method of calling sched_getaffinity after calling sched_setaffinity. That > >>>will > >>>give you the canonical cpuset that you are executing on, taking all cpuset > >>>filters into account (including cgroups and any other restrictions). It's > >>>far > >>>simpler as well, as it doesn't require a ton of file/string processing. > >>Yes, this way is much better for cpuset discovery. But is there such a > >>syscall for hugepages? > >> > >In what capacity? Interrogating how many hugepages you have, or to what node > >they are affined to? Capacity would require reading the requisite proc > >file, as > >there's no posix api for this resource. Node affinity can be implied by >
[dpdk-dev] [PATCH v2 15/16] fm10k/base: move constants to the right of binary operators
The upstream Linux kernel community prefers constants are to the right of binary operators. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_pf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_pf.c b/drivers/net/fm10k/base/fm10k_pf.c index 456fe64..105babf 100644 --- a/drivers/net/fm10k/base/fm10k_pf.c +++ b/drivers/net/fm10k/base/fm10k_pf.c @@ -759,8 +759,8 @@ STATIC s32 fm10k_iov_assign_resources_pf(struct fm10k_hw *hw, u16 num_vfs, FM10K_RXDCTL_WRITE_BACK_MIN_DELAY | FM10K_RXDCTL_DROP_ON_EMPTY); FM10K_WRITE_REG(hw, FM10K_RXQCTL(vf_q_idx), - FM10K_RXQCTL_VF | - (i << FM10K_RXQCTL_VF_SHIFT)); + (i << FM10K_RXQCTL_VF_SHIFT) | + FM10K_RXQCTL_VF); /* map queue pair to VF */ FM10K_WRITE_REG(hw, FM10K_TQMAP(qmap_idx), vf_q_idx); @@ -1035,7 +1035,7 @@ STATIC s32 fm10k_iov_reset_resources_pf(struct fm10k_hw *hw, txqctl = ((u32)vf_vid << FM10K_TXQCTL_VID_SHIFT) | (vf_idx << FM10K_TXQCTL_TC_SHIFT) | FM10K_TXQCTL_VF | vf_idx; - rxqctl = FM10K_RXQCTL_VF | (vf_idx << FM10K_RXQCTL_VF_SHIFT); + rxqctl = (vf_idx << FM10K_RXQCTL_VF_SHIFT) | FM10K_RXQCTL_VF; /* stop further DMA and reset queue ownership back to VF */ for (i = vf_q_idx; i < (queues_per_pool + vf_q_idx); i++) { -- 1.9.3
[dpdk-dev] [PATCH v2 14/16] fm10k/base: TLV structures must be 4byte aligned, not 1byte aligned
Per comments from an upstream patch, and looking at how TLV LE_STRUCT code works, we actually want these structures to be 4byte aligned, not 1byte aligned. In practice, 1byte alignment has worked so far because all our structures end up being a multiple of 4. But if a future TLV structure were added that had a u8 or similar sticking on the end things would break. Fix this by using 4byte alignment which will prevent the TLV LE_STRUCT code from breaking. Update the comment explaining that we need 4byte alignment of our structures. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_pf.h | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_pf.h b/drivers/net/fm10k/base/fm10k_pf.h index 92e2962..ee8527a 100644 --- a/drivers/net/fm10k/base/fm10k_pf.h +++ b/drivers/net/fm10k/base/fm10k_pf.h @@ -91,14 +91,14 @@ enum fm10k_pf_tlv_attr_id_v1 { #define FM10K_MSG_UPDATE_PVID_PVID_SHIFT 16 #define FM10K_MSG_UPDATE_PVID_PVID_SIZE16 -/* The following data structures are overlayed specifically to TLV mailbox - * messages, and must not have gaps between their values. They must line up - * correctly to the TLV definition. +/* The following data structures are overlayed directly onto TLV mailbox + * messages, and must not break 4 byte alignment. Ensure the structures line + * up correctly as per their TLV definition. */ #ifdef C99 -#pragma pack(push, 1) +#pragma pack(push, 4) #else -#pragma pack(1) +#pragma pack(4) #endif /* C99 */ struct fm10k_mac_update { -- 1.9.3
[dpdk-dev] [PATCH v2 13/16] fm10k/base: fix comment per upstream review changes
The comment here was changed during review of upstream patch, and the new wording is slightly more clear. Re-write the comment in SHARED code based on this new wording. Fix a number of mailbox comment issues with function header comments, lower-case acronyms (i.e. FIFO, TLV), incorrect function names in DEBUGFUNC(), duplicate comments and a stubbed-out header comment for fm10k_sm_mbx_init. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_mbx.c | 61 ++ drivers/net/fm10k/base/fm10k_mbx.h | 4 +-- drivers/net/fm10k/base/fm10k_pf.c | 12 drivers/net/fm10k/base/fm10k_tlv.h | 4 +-- 4 files changed, 46 insertions(+), 35 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_mbx.c b/drivers/net/fm10k/base/fm10k_mbx.c index 7d03704..2e70434 100644 --- a/drivers/net/fm10k/base/fm10k_mbx.c +++ b/drivers/net/fm10k/base/fm10k_mbx.c @@ -70,7 +70,7 @@ STATIC u16 fm10k_fifo_unused(struct fm10k_mbx_fifo *fifo) } /** - * fm10k_fifo_empty - Test to verify if fifo is empty + * fm10k_fifo_empty - Test to verify if FIFO is empty * @fifo: pointer to FIFO * * This function returns true if the FIFO is empty, else false @@ -85,7 +85,7 @@ STATIC bool fm10k_fifo_empty(struct fm10k_mbx_fifo *fifo) * @fifo: pointer to FIFO * @offset: offset to add to head * - * This function returns the indices into the fifo based on head + offset + * This function returns the indices into the FIFO based on head + offset **/ STATIC u16 fm10k_fifo_head_offset(struct fm10k_mbx_fifo *fifo, u16 offset) { @@ -97,7 +97,7 @@ STATIC u16 fm10k_fifo_head_offset(struct fm10k_mbx_fifo *fifo, u16 offset) * @fifo: pointer to FIFO * @offset: offset to add to tail * - * This function returns the indices into the fifo based on tail + offset + * This function returns the indices into the FIFO based on tail + offset **/ STATIC u16 fm10k_fifo_tail_offset(struct fm10k_mbx_fifo *fifo, u16 offset) { @@ -173,7 +173,7 @@ STATIC u16 fm10k_mbx_index_len(struct fm10k_mbx_info *mbx, u16 head, u16 tail) /** * fm10k_mbx_tail_add - 
Determine new tail value with added offset * @mbx: pointer to mailbox - * @offset: length to add to head offset + * @offset: length to add to tail offset * * This function takes the local tail index and recomputes it for * a given length added as an offset. @@ -189,7 +189,7 @@ STATIC u16 fm10k_mbx_tail_add(struct fm10k_mbx_info *mbx, u16 offset) /** * fm10k_mbx_tail_sub - Determine new tail value with subtracted offset * @mbx: pointer to mailbox - * @offset: length to add to head offset + * @offset: length to add to tail offset * * This function takes the local tail index and recomputes it for * a given length added as an offset. @@ -253,7 +253,7 @@ STATIC u16 fm10k_mbx_pushed_tail_len(struct fm10k_mbx_info *mbx) } /** - * fm10k_fifo_write_copy - pulls data off of msg and places it in fifo + * fm10k_fifo_write_copy - pulls data off of msg and places it in FIFO * @fifo: pointer to FIFO * @msg: message array to populate * @tail_offset: additional offset to add to tail pointer @@ -331,7 +331,7 @@ STATIC u16 fm10k_mbx_validate_msg_size(struct fm10k_mbx_info *mbx, u16 len) u16 total_len = 0, msg_len; u32 *msg; - DEBUGFUNC("fm10k_mbx_validate_msg"); + DEBUGFUNC("fm10k_mbx_validate_msg_size"); /* length should include previous amounts pushed */ len += mbx->pushed; @@ -353,6 +353,7 @@ STATIC u16 fm10k_mbx_validate_msg_size(struct fm10k_mbx_info *mbx, u16 len) /** * fm10k_mbx_write_copy - pulls data off of Tx FIFO and places it in mbmem + * @hw: pointer to hardware structure * @mbx: pointer to mailbox * * This function will take a section of the Tx FIFO and copy it into the @@ -734,7 +735,7 @@ STATIC bool fm10k_mbx_tx_complete(struct fm10k_mbx_info *mbx) * @hw: pointer to hardware structure * @mbx: pointer to mailbox * - * This function dequeues messages and hands them off to the tlv parser. + * This function dequeues messages and hands them off to the TLV parser. * It will return the number of messages processed when called. 
**/ STATIC u16 fm10k_mbx_dequeue_rx(struct fm10k_hw *hw, @@ -951,7 +952,7 @@ STATIC void fm10k_mbx_create_fake_disconnect_hdr(struct fm10k_mbx_info *mbx) } /** - * fm10k_mbx_create_error_msg - Generate a error message + * fm10k_mbx_create_error_msg - Generate an error message * @mbx: pointer to mailbox * @err: local error encountered * @@ -984,7 +985,6 @@ STATIC void fm10k_mbx_create_error_msg(struct fm10k_mbx_info *mbx, s32 err) /** * fm10k_mbx_validate_msg_hdr - Validate common fields in the message header * @mbx: pointer to mailbox - * @msg: message array to read * * This function will parse up the fields in the mailbox header and return * an error if the header contains any of a number of invalid configurations @@ -1050,11 +1050,12 @@ STATIC s32
[dpdk-dev] [PATCH v2 12/16] fm10k/base: consistently use VLAN ID when referencing vid variables
The vid variable name is shorthand for VLAN ID, so we should use this in comments explaining what is happening. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_pf.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_pf.c b/drivers/net/fm10k/base/fm10k_pf.c index f5cbda4..716d7f1 100644 --- a/drivers/net/fm10k/base/fm10k_pf.c +++ b/drivers/net/fm10k/base/fm10k_pf.c @@ -970,7 +970,7 @@ err_out: txqctl |= (vf_idx << FM10K_TXQCTL_TC_SHIFT) | FM10K_TXQCTL_VF | vf_idx; - /* assign VID */ + /* assign VLAN ID */ for (i = 0; i < queues_per_pool; i++) FM10K_WRITE_REG(hw, FM10K_TXQCTL(vf_q_idx + i), txqctl); @@ -1215,12 +1215,12 @@ s32 fm10k_iov_msg_msix_pf(struct fm10k_hw *hw, u32 **results, } /** - * fm10k_iov_select_vid - Select correct default vid + * fm10k_iov_select_vid - Select correct default VLAN ID * @hw: Pointer to hardware structure - * @vid: vid to correct + * @vid: VLAN ID to correct * - * Will report an error if vid is out of range. For vid = 0, it will return - * either the pf_vid or sw_vid depending on which one is set. + * Will report an error if the VLAN ID is out of range. For VID = 0, it will + * return either the pf_vid or sw_vid depending on which one is set. */ STATIC s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid) { @@ -1783,7 +1783,7 @@ static s32 fm10k_msg_update_pvid_pf(struct fm10k_hw *hw, u32 **results, if (!fm10k_glort_valid_pf(hw, glort)) return FM10K_ERR_PARAM; - /* verify VID is valid */ + /* verify VLAN ID is valid */ if (pvid >= FM10K_VLAN_TABLE_VID_MAX) return FM10K_ERR_PARAM; -- 1.9.3
[dpdk-dev] [PATCH v2 11/16] fm10k/base: allow removal of is_slot_appropriate function
The Linux Kernel provides the OS a call "pcie_get_minimum_link" which can crawl the PCIe tree and determine the actual minimum link speed of a device which is a more general check than provided by is_slot_appropriate. Thus, the upstream driver does not use or want the is_slot_appropriate function call. Add a NO_IS_SLOT_APPROPRIATE_CHECK definition which can be defined during strip process to remove the code. If left undefined (the default) then the code will all be active and no driver changes should be necessary. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_api.c | 2 ++ drivers/net/fm10k/base/fm10k_api.h | 2 ++ drivers/net/fm10k/base/fm10k_pf.c | 4 drivers/net/fm10k/base/fm10k_type.h | 2 ++ drivers/net/fm10k/base/fm10k_vf.c | 4 5 files changed, 14 insertions(+) diff --git a/drivers/net/fm10k/base/fm10k_api.c b/drivers/net/fm10k/base/fm10k_api.c index eb5bdaa..c49d20d 100644 --- a/drivers/net/fm10k/base/fm10k_api.c +++ b/drivers/net/fm10k/base/fm10k_api.c @@ -181,6 +181,7 @@ s32 fm10k_get_bus_info(struct fm10k_hw *hw) FM10K_NOT_IMPLEMENTED); } +#ifndef NO_IS_SLOT_APPROPRIATE_CHECK /** * fm10k_is_slot_appropriate - Indicate appropriate slot for this SKU * @hw: pointer to hardware structure @@ -195,6 +196,7 @@ bool fm10k_is_slot_appropriate(struct fm10k_hw *hw) return true; } +#endif /** * fm10k_update_vlan - Clear VLAN ID to VLAN filter table * @hw: pointer to hardware structure diff --git a/drivers/net/fm10k/base/fm10k_api.h b/drivers/net/fm10k/base/fm10k_api.h index 113aef5..2ab3149 100644 --- a/drivers/net/fm10k/base/fm10k_api.h +++ b/drivers/net/fm10k/base/fm10k_api.h @@ -44,7 +44,9 @@ s32 fm10k_stop_hw(struct fm10k_hw *hw); s32 fm10k_start_hw(struct fm10k_hw *hw); s32 fm10k_init_shared_code(struct fm10k_hw *hw); s32 fm10k_get_bus_info(struct fm10k_hw *hw); +#ifndef NO_IS_SLOT_APPROPRIATE_CHECK bool fm10k_is_slot_appropriate(struct fm10k_hw *hw); +#endif s32 fm10k_update_vlan(struct fm10k_hw *hw, u32 vid, u8 idx, bool set); s32 
fm10k_read_mac_addr(struct fm10k_hw *hw); void fm10k_update_hw_stats(struct fm10k_hw *hw, struct fm10k_hw_stats *stats); diff --git a/drivers/net/fm10k/base/fm10k_pf.c b/drivers/net/fm10k/base/fm10k_pf.c index a1469aa..f5cbda4 100644 --- a/drivers/net/fm10k/base/fm10k_pf.c +++ b/drivers/net/fm10k/base/fm10k_pf.c @@ -216,6 +216,7 @@ STATIC s32 fm10k_init_hw_pf(struct fm10k_hw *hw) return FM10K_SUCCESS; } +#ifndef NO_IS_SLOT_APPROPRIATE_CHECK /** * fm10k_is_slot_appropriate_pf - Indicate appropriate slot for this SKU * @hw: pointer to hardware structure @@ -231,6 +232,7 @@ STATIC bool fm10k_is_slot_appropriate_pf(struct fm10k_hw *hw) (hw->bus.width == hw->bus_caps.width); } +#endif /** * fm10k_update_vlan_pf - Update status of VLAN ID in VLAN filter table * @hw: pointer to hardware structure @@ -2064,7 +2066,9 @@ s32 fm10k_init_ops_pf(struct fm10k_hw *hw) mac->ops.init_hw = _init_hw_pf; mac->ops.start_hw = _start_hw_generic; mac->ops.stop_hw = _stop_hw_generic; +#ifndef NO_IS_SLOT_APPROPRIATE_CHECK mac->ops.is_slot_appropriate = _is_slot_appropriate_pf; +#endif mac->ops.update_vlan = _update_vlan_pf; mac->ops.read_mac_addr = _read_mac_addr_pf; mac->ops.update_uc_addr = _update_uc_addr_pf; diff --git a/drivers/net/fm10k/base/fm10k_type.h b/drivers/net/fm10k/base/fm10k_type.h index c9885a1..ba0a184 100644 --- a/drivers/net/fm10k/base/fm10k_type.h +++ b/drivers/net/fm10k/base/fm10k_type.h @@ -679,7 +679,9 @@ struct fm10k_mac_ops { s32 (*stop_hw)(struct fm10k_hw *); s32 (*get_bus_info)(struct fm10k_hw *); s32 (*get_host_state)(struct fm10k_hw *, bool *); +#ifndef NO_IS_SLOT_APPROPRIATE_CHECK bool (*is_slot_appropriate)(struct fm10k_hw *); +#endif s32 (*update_vlan)(struct fm10k_hw *, u32, u8, bool); s32 (*read_mac_addr)(struct fm10k_hw *); s32 (*update_uc_addr)(struct fm10k_hw *, u16, const u8 *, diff --git a/drivers/net/fm10k/base/fm10k_vf.c b/drivers/net/fm10k/base/fm10k_vf.c index 43eb081..efbdbd1 100644 --- a/drivers/net/fm10k/base/fm10k_vf.c +++ 
b/drivers/net/fm10k/base/fm10k_vf.c @@ -178,6 +178,7 @@ reset_max_queues: return err; } +#ifndef NO_IS_SLOT_APPROPRIATE_CHECK /** * fm10k_is_slot_appropriate_vf - Indicate appropriate slot for this SKU * @hw: pointer to hardware structure @@ -194,6 +195,7 @@ STATIC bool fm10k_is_slot_appropriate_vf(struct fm10k_hw *hw) return TRUE; } +#endif /* This structure defines the attibutes to be parsed below */ const struct fm10k_tlv_attr fm10k_mac_vlan_msg_attr[] = { FM10K_TLV_ATTR_U32(FM10K_MAC_VLAN_MSG_VLAN), @@ -648,7 +650,9 @@ s32 fm10k_init_ops_vf(struct fm10k_hw *hw) mac->ops.init_hw = _init_hw_vf; mac->ops.start_hw = _start_hw_generic; mac->ops.stop_hw = _stop_hw_vf; +#ifndef NO_IS_SLOT_APPROPRIATE_CHECK
[dpdk-dev] [PATCH v2 10/16] fm10k/base: use memcpy for mac addr copy
Use memcpy instead of copying MAC address byte-by-byte. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_pf.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_pf.c b/drivers/net/fm10k/base/fm10k_pf.c index 7d48210..a1469aa 100644 --- a/drivers/net/fm10k/base/fm10k_pf.c +++ b/drivers/net/fm10k/base/fm10k_pf.c @@ -300,7 +300,6 @@ STATIC s32 fm10k_read_mac_addr_pf(struct fm10k_hw *hw) { u8 perm_addr[ETH_ALEN]; u32 serial_num; - int i; DEBUGFUNC("fm10k_read_mac_addr_pf"); @@ -324,10 +323,8 @@ STATIC s32 fm10k_read_mac_addr_pf(struct fm10k_hw *hw) perm_addr[4] = (u8)(serial_num >> 8); perm_addr[5] = (u8)(serial_num); - for (i = 0; i < ETH_ALEN; i++) { - hw->mac.perm_addr[i] = perm_addr[i]; - hw->mac.addr[i] = perm_addr[i]; - } + memcpy(hw->mac.perm_addr, perm_addr, ETH_ALEN); + memcpy(hw->mac.addr, perm_addr, ETH_ALEN); return FM10K_SUCCESS; } -- 1.9.3
[dpdk-dev] [PATCH v2 09/16] fm10k/base: do not use CamelCase
The upstream Linux kernel community prefers avoiding CamelCase in variables, function names, etc. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_type.h | 14 +++--- drivers/net/fm10k/fm10k_ethdev.c| 24 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_type.h b/drivers/net/fm10k/base/fm10k_type.h index 387d25b..c9885a1 100644 --- a/drivers/net/fm10k/base/fm10k_type.h +++ b/drivers/net/fm10k/base/fm10k_type.h @@ -531,13 +531,13 @@ struct fm10k_hw; #endif enum fm10k_int_source { - fm10k_int_Mailbox = 0, - fm10k_int_PCIeFault = 1, - fm10k_int_SwitchUpDown = 2, - fm10k_int_SwitchEvent = 3, - fm10k_int_SRAM = 4, - fm10k_int_VFLR = 5, - fm10k_int_MaxHoldTime = 6, + fm10k_int_mailbox = 0, + fm10k_int_pcie_fault= 1, + fm10k_int_switch_up_down= 2, + fm10k_int_switch_event = 3, + fm10k_int_sram = 4, + fm10k_int_vflr = 5, + fm10k_int_max_hold_time = 6, fm10k_int_sources_max_pf }; diff --git a/drivers/net/fm10k/fm10k_ethdev.c b/drivers/net/fm10k/fm10k_ethdev.c index 2c38ce9..a118cf4 100644 --- a/drivers/net/fm10k/fm10k_ethdev.c +++ b/drivers/net/fm10k/fm10k_ethdev.c @@ -2074,12 +2074,12 @@ fm10k_dev_enable_intr_pf(struct rte_eth_dev *dev) /* Bind all local non-queue interrupt to vector 0 */ int_map |= 0; - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_Mailbox), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_PCIeFault), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SwitchUpDown), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SwitchEvent), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SRAM), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_VFLR), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_mailbox), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_pcie_fault), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_switch_up_down), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_switch_event), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_sram), 
int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_vflr), int_map); /* Enable misc causes */ FM10K_WRITE_REG(hw, FM10K_EIMR, FM10K_EIMR_ENABLE(PCA_FAULT) | @@ -2105,12 +2105,12 @@ fm10k_dev_disable_intr_pf(struct rte_eth_dev *dev) int_map |= 0; - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_Mailbox), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_PCIeFault), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SwitchUpDown), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SwitchEvent), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_SRAM), int_map); - FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_VFLR), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_mailbox), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_pcie_fault), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_switch_up_down), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_switch_event), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_sram), int_map); + FM10K_WRITE_REG(hw, FM10K_INT_MAP(fm10k_int_vflr), int_map); /* Disable misc causes */ FM10K_WRITE_REG(hw, FM10K_EIMR, FM10K_EIMR_DISABLE(PCA_FAULT) | -- 1.9.3
[dpdk-dev] [PATCH v2 07/16] fm10k/base: fix checkpatch warning
Cleanup lines over 80 characters. Cleanup useless else, checkpatch warns that else is not generally useful after a break or return. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_mbx.c | 2 +- drivers/net/fm10k/base/fm10k_pf.c | 19 ++- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_mbx.c b/drivers/net/fm10k/base/fm10k_mbx.c index 3c9ab3a..7d03704 100644 --- a/drivers/net/fm10k/base/fm10k_mbx.c +++ b/drivers/net/fm10k/base/fm10k_mbx.c @@ -930,7 +930,7 @@ STATIC void fm10k_mbx_create_disconnect_hdr(struct fm10k_mbx_info *mbx) } /** - * fm10k_mbx_create_fake_disconnect_hdr - Generate a false disconnect mailbox header + * fm10k_mbx_create_fake_disconnect_hdr - Generate a false disconnect mbox hdr * @mbx: pointer to mailbox * * This function creates a fake disconnect header for loading into remote diff --git a/drivers/net/fm10k/base/fm10k_pf.c b/drivers/net/fm10k/base/fm10k_pf.c index 6de679e..3ee88b6 100644 --- a/drivers/net/fm10k/base/fm10k_pf.c +++ b/drivers/net/fm10k/base/fm10k_pf.c @@ -1278,8 +1278,8 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 **results, err = fm10k_iov_select_vid(vf_info, (u16)vid); if (err < 0) return err; - else - vid = err; + + vid = err; /* update VSI info for VF in regards to VLAN table */ err = hw->mac.ops.update_vlan(hw, vid, vf_info->vsi, set); @@ -1304,8 +1304,8 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 **results, err = fm10k_iov_select_vid(vf_info, vlan); if (err < 0) return err; - else - vlan = (u16)err; + + vlan = (u16)err; /* notify switch of request for new unicast address */ err = hw->mac.ops.update_uc_addr(hw, vf_info->glort, @@ -1330,8 +1330,8 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 **results, err = fm10k_iov_select_vid(vf_info, vlan); if (err < 0) return err; - else - vlan = (u16)err; + + vlan = (u16)err; /* notify switch of request for new multicast address */ err = hw->mac.ops.update_mc_addr(hw, vf_info->glort, @@ -1500,9 
+1500,10 @@ STATIC void fm10k_update_hw_stats_pf(struct fm10k_hw *hw, xec = fm10k_read_hw_stats_32b(hw, FM10K_STATS_XEC, >xec); vlan_drop = fm10k_read_hw_stats_32b(hw, FM10K_STATS_VLAN_DROP, >vlan_drop); - loopback_drop = fm10k_read_hw_stats_32b(hw, - FM10K_STATS_LOOPBACK_DROP, - >loopback_drop); + loopback_drop = + fm10k_read_hw_stats_32b(hw, + FM10K_STATS_LOOPBACK_DROP, + >loopback_drop); nodesc_drop = fm10k_read_hw_stats_32b(hw, FM10K_STATS_NODESC_DROP, >nodesc_drop); -- 1.9.3
[dpdk-dev] [PATCH v2 06/16] fm10k/base: document ITR scale workaround in VF TDLEN register
Add comments which properly explain the undocumented use of bits in TDLEN register prior to VF initializing it to the correct value. Note that the mechanism is entirely software-defined and explain its purpose to help reduce confusion in the future. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_pf.c | 6 +- drivers/net/fm10k/base/fm10k_type.h | 9 + drivers/net/fm10k/base/fm10k_vf.c | 9 + 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/net/fm10k/base/fm10k_pf.c b/drivers/net/fm10k/base/fm10k_pf.c index 5b8c039..6de679e 100644 --- a/drivers/net/fm10k/base/fm10k_pf.c +++ b/drivers/net/fm10k/base/fm10k_pf.c @@ -958,7 +958,8 @@ STATIC s32 fm10k_iov_assign_default_mac_vlan_pf(struct fm10k_hw *hw, FM10K_WRITE_REG(hw, FM10K_TDBAH(vf_q_idx), tdbah); /* Provide the VF the ITR scale, using software-defined fields in TDLEN -* to pass the information during VF initialization +* to pass the information during VF initialization. See definition of +* FM10K_TDLEN_ITR_SCALE_SHIFT for more details. */ FM10K_WRITE_REG(hw, FM10K_TDLEN(vf_q_idx), hw->mac.itr_scale << FM10K_TDLEN_ITR_SCALE_SHIFT); @@ -1095,6 +1096,9 @@ STATIC s32 fm10k_iov_reset_resources_pf(struct fm10k_hw *hw, for (i = queues_per_pool; i--;) { FM10K_WRITE_REG(hw, FM10K_TDBAL(vf_q_idx + i), tdbal); FM10K_WRITE_REG(hw, FM10K_TDBAH(vf_q_idx + i), tdbah); + /* See definition of FM10K_TDLEN_ITR_SCALE_SHIFT for an +* explanation of how TDLEN is used. 
+*/ FM10K_WRITE_REG(hw, FM10K_TDLEN(vf_q_idx + i), hw->mac.itr_scale << FM10K_TDLEN_ITR_SCALE_SHIFT); diff --git a/drivers/net/fm10k/base/fm10k_type.h b/drivers/net/fm10k/base/fm10k_type.h index 44187b1..5db6345 100644 --- a/drivers/net/fm10k/base/fm10k_type.h +++ b/drivers/net/fm10k/base/fm10k_type.h @@ -350,6 +350,15 @@ struct fm10k_hw; #define FM10K_TDBAL(_n) ((0x40 * (_n)) + 0x8000) #define FM10K_TDBAH(_n) ((0x40 * (_n)) + 0x8001) #define FM10K_TDLEN(_n) ((0x40 * (_n)) + 0x8002) +/* When first initialized, VFs need to know the Interrupt Throttle Rate (ITR) + * scale which is based on the PCIe speed but the speed information in the PCI + * configuration space may not be accurate. The PF already knows the ITR scale + * but there is no defined method to pass that information from the PF to the + * VF. This is accomplished during VF initialization by temporarily co-opting + * the yet-to-be-used TDLEN register to have the PF store the ITR shift for + * the VF to retrieve before the VF needs to use the TDLEN register for its + * intended purpose, i.e. before the Tx resources are allocated. + */ #define FM10K_TDLEN_ITR_SCALE_SHIFT 9 #define FM10K_TDLEN_ITR_SCALE_MASK 0x0E00 #define FM10K_TDLEN_ITR_SCALE_GEN1 2 diff --git a/drivers/net/fm10k/base/fm10k_vf.c b/drivers/net/fm10k/base/fm10k_vf.c index 9b10ee4..43eb081 100644 --- a/drivers/net/fm10k/base/fm10k_vf.c +++ b/drivers/net/fm10k/base/fm10k_vf.c @@ -74,6 +74,11 @@ STATIC s32 fm10k_stop_hw_vf(struct fm10k_hw *hw) FM10K_WRITE_REG(hw, FM10K_TDBAH(i), bah); FM10K_WRITE_REG(hw, FM10K_RDBAL(i), bal); FM10K_WRITE_REG(hw, FM10K_RDBAH(i), bah); + /* Restore ITR scale in software-defined mechanism in TDLEN +* for next VF initialization. See definition of +* FM10K_TDLEN_ITR_SCALE_SHIFT for more details on the use of +* TDLEN here. 
+*/ FM10K_WRITE_REG(hw, FM10K_TDLEN(i), tdlen); } @@ -157,6 +162,10 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw) /* fetch default VLAN and ITR scale */ hw->mac.default_vid = (FM10K_READ_REG(hw, FM10K_TXQCTL(0)) & FM10K_TXQCTL_VID_MASK) >> FM10K_TXQCTL_VID_SHIFT; + /* Read the ITR scale from TDLEN. See the definition of +* FM10K_TDLEN_ITR_SCALE_SHIFT for more information about how TDLEN is +* used here. +*/ hw->mac.itr_scale = (FM10K_READ_REG(hw, FM10K_TDLEN(0)) & FM10K_TDLEN_ITR_SCALE_MASK) >> FM10K_TDLEN_ITR_SCALE_SHIFT; -- 1.9.3
[dpdk-dev] [PATCH v2 05/16] fm10k/base: reset max_queues on init_hw_vf failure
VF drivers must detect how many queues are available. Previously, the driver assumed that each VF has at minimum 1 queue. This assumption is incorrect, since it is possible that the PF has not yet assigned the queues to the VF by the time the VF checks. To resolve this, we added a check first to ensure that the first queue is in fact owned by the VF at init_hw_vf time. However, the code flow did not reset hw->mac.max_queues to 0. In some cases, such as during reinit flows, we call init_hw_vf without clearing the previous value of hw->mac.max_queues. Due to this, when init_hw_vf errors out, if its error code is not properly handled the VF driver may still believe it has queues which no longer belong to it. Fix this by clearing the hw->mac.max_queues on exit due to errors. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_vf.c | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_vf.c b/drivers/net/fm10k/base/fm10k_vf.c index 39bc927..9b10ee4 100644 --- a/drivers/net/fm10k/base/fm10k_vf.c +++ b/drivers/net/fm10k/base/fm10k_vf.c @@ -128,8 +128,10 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw) /* verify we have at least 1 queue */ if (!~FM10K_READ_REG(hw, FM10K_TXQCTL(0)) || - !~FM10K_READ_REG(hw, FM10K_RXQCTL(0))) - return FM10K_ERR_NO_RESOURCES; + !~FM10K_READ_REG(hw, FM10K_RXQCTL(0))) { + err = FM10K_ERR_NO_RESOURCES; + goto reset_max_queues; + } /* determine how many queues we have */ for (i = 1; tqdloc0 && (i < FM10K_MAX_QUEUES_POOL); i++) { @@ -147,7 +149,7 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw) /* shut down queues we own and reset DMA configuration */ err = fm10k_disable_queues_generic(hw, i); if (err) - return err; + goto reset_max_queues; /* record maximum queue count */ hw->mac.max_queues = i; @@ -160,6 +162,11 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw) FM10K_TDLEN_ITR_SCALE_SHIFT; return FM10K_SUCCESS; + +reset_max_queues: + hw->mac.max_queues = 0; + + return err; } /** -- 
1.9.3
[dpdk-dev] [PATCH v2 04/16] fm10k/base: use bitshift for itr_scale
Upstream community wishes us to use bitshift instead of a divisor, because this is faster, and prevents any need for a '0' check. In our case, this even works out because default Gen3 will be 0. Because of this, we are also able to remove the check for non-zero value in the vf code path since that will already be the default Gen3 case. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_type.h | 6 +++--- drivers/net/fm10k/base/fm10k_vf.c | 4 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_type.h b/drivers/net/fm10k/base/fm10k_type.h index 62fa73f..44187b1 100644 --- a/drivers/net/fm10k/base/fm10k_type.h +++ b/drivers/net/fm10k/base/fm10k_type.h @@ -352,9 +352,9 @@ struct fm10k_hw; #define FM10K_TDLEN(_n)((0x40 * (_n)) + 0x8002) #define FM10K_TDLEN_ITR_SCALE_SHIFT9 #define FM10K_TDLEN_ITR_SCALE_MASK 0x0E00 -#define FM10K_TDLEN_ITR_SCALE_GEN1 4 -#define FM10K_TDLEN_ITR_SCALE_GEN2 2 -#define FM10K_TDLEN_ITR_SCALE_GEN3 1 +#define FM10K_TDLEN_ITR_SCALE_GEN1 2 +#define FM10K_TDLEN_ITR_SCALE_GEN2 1 +#define FM10K_TDLEN_ITR_SCALE_GEN3 0 #define FM10K_TPH_TXCTRL(_n) ((0x40 * (_n)) + 0x8003) #define FM10K_TPH_TXCTRL_DESC_TPHEN0x0020 #define FM10K_TPH_TXCTRL_DESC_RROEN0x0200 diff --git a/drivers/net/fm10k/base/fm10k_vf.c b/drivers/net/fm10k/base/fm10k_vf.c index 7822ab6..39bc927 100644 --- a/drivers/net/fm10k/base/fm10k_vf.c +++ b/drivers/net/fm10k/base/fm10k_vf.c @@ -159,10 +159,6 @@ STATIC s32 fm10k_init_hw_vf(struct fm10k_hw *hw) FM10K_TDLEN_ITR_SCALE_MASK) >> FM10K_TDLEN_ITR_SCALE_SHIFT; - /* ensure a non-zero itr scale */ - if (!hw->mac.itr_scale) - hw->mac.itr_scale = FM10K_TDLEN_ITR_SCALE_GEN3; - return FM10K_SUCCESS; } -- 1.9.3
[dpdk-dev] [PATCH v2 03/16] fm10k/base: cleanup namespace pollution and correct typecast
Correct typecast in fm10k_update_xc_addr_pf. Make functions that are only referenced locally static. And fix the function header comment for fm10k_tlv_attr_nest_stop() while we're at it. Wrap fm10k_msg_data fm10k_iov_msg_data_pf[] in the new ifndef NO_DEFAULT_SRIOV_MSG_HANDLERS so that drivers with custom SR-IOV message handlers can strip it. remove unused struct element in struct fm10k_mac_ops. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_pf.c | 10 ++ drivers/net/fm10k/base/fm10k_pf.h | 4 ++-- drivers/net/fm10k/base/fm10k_tlv.c | 16 drivers/net/fm10k/base/fm10k_tlv.h | 5 - drivers/net/fm10k/base/fm10k_type.h | 1 - drivers/net/fm10k/base/fm10k_vf.c | 2 -- 6 files changed, 16 insertions(+), 22 deletions(-) diff --git a/drivers/net/fm10k/base/fm10k_pf.c b/drivers/net/fm10k/base/fm10k_pf.c index 6e6d71e..5b8c039 100644 --- a/drivers/net/fm10k/base/fm10k_pf.c +++ b/drivers/net/fm10k/base/fm10k_pf.c @@ -379,8 +379,8 @@ STATIC s32 fm10k_update_xc_addr_pf(struct fm10k_hw *hw, u16 glort, ((u32)mac[3] << 16) | ((u32)mac[4] << 8) | ((u32)mac[5])); - mac_update.mac_upper = FM10K_CPU_TO_LE16(((u32)mac[0] << 8) | -((u32)mac[1])); + mac_update.mac_upper = FM10K_CPU_TO_LE16(((u16)mac[0] << 8) | + ((u16)mac[1])); mac_update.vlan = FM10K_CPU_TO_LE16(vid); mac_update.glort = FM10K_CPU_TO_LE16(glort); mac_update.action = add ? 
0 : 1; @@ -1457,6 +1457,7 @@ s32 fm10k_iov_msg_lport_state_pf(struct fm10k_hw *hw, u32 **results, return err; } +#ifndef NO_DEFAULT_SRIOV_MSG_HANDLERS const struct fm10k_msg_data fm10k_iov_msg_data_pf[] = { FM10K_TLV_MSG_TEST_HANDLER(fm10k_tlv_msg_test), FM10K_VF_MSG_MSIX_HANDLER(fm10k_iov_msg_msix_pf), @@ -1465,6 +1466,7 @@ const struct fm10k_msg_data fm10k_iov_msg_data_pf[] = { FM10K_TLV_MSG_ERROR_HANDLER(fm10k_tlv_msg_error), }; +#endif /** * fm10k_update_stats_hw_pf - Updates hardware related statistics of PF * @hw: pointer to hardware structure @@ -1754,8 +1756,8 @@ const struct fm10k_tlv_attr fm10k_update_pvid_msg_attr[] = { * * This handler configures the default VLAN for the PF **/ -s32 fm10k_msg_update_pvid_pf(struct fm10k_hw *hw, u32 **results, -struct fm10k_mbx_info *mbx) +static s32 fm10k_msg_update_pvid_pf(struct fm10k_hw *hw, u32 **results, + struct fm10k_mbx_info *mbx) { u16 glort, pvid; u32 pvid_update; diff --git a/drivers/net/fm10k/base/fm10k_pf.h b/drivers/net/fm10k/base/fm10k_pf.h index 44bd193..92e2962 100644 --- a/drivers/net/fm10k/base/fm10k_pf.h +++ b/drivers/net/fm10k/base/fm10k_pf.h @@ -149,8 +149,6 @@ extern const struct fm10k_tlv_attr fm10k_lport_map_msg_attr[]; #define FM10K_PF_MSG_LPORT_MAP_HANDLER(func) \ FM10K_MSG_HANDLER(FM10K_PF_MSG_ID_LPORT_MAP, \ fm10k_lport_map_msg_attr, func) -s32 fm10k_msg_update_pvid_pf(struct fm10k_hw *, u32 **, -struct fm10k_mbx_info *); extern const struct fm10k_tlv_attr fm10k_update_pvid_msg_attr[]; #define FM10K_PF_MSG_UPDATE_PVID_HANDLER(func) \ FM10K_MSG_HANDLER(FM10K_PF_MSG_ID_UPDATE_PVID, \ @@ -183,7 +181,9 @@ s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *, u32 **, struct fm10k_mbx_info *); s32 fm10k_iov_msg_lport_state_pf(struct fm10k_hw *, u32 **, struct fm10k_mbx_info *); +#ifndef NO_DEFAULT_SRIOV_MSG_HANDLERS extern const struct fm10k_msg_data fm10k_iov_msg_data_pf[]; +#endif s32 fm10k_init_ops_pf(struct fm10k_hw *hw); #endif /* _FM10K_PF_H */ diff --git a/drivers/net/fm10k/base/fm10k_tlv.c 
b/drivers/net/fm10k/base/fm10k_tlv.c index 1d9d7d8..ade87d1 100644 --- a/drivers/net/fm10k/base/fm10k_tlv.c +++ b/drivers/net/fm10k/base/fm10k_tlv.c @@ -63,8 +63,8 @@ s32 fm10k_tlv_msg_init(u32 *msg, u16 msg_id) * the attribute buffer. It will return success if provided with a valid * pointers. **/ -s32 fm10k_tlv_attr_put_null_string(u32 *msg, u16 attr_id, - const unsigned char *string) +static s32 fm10k_tlv_attr_put_null_string(u32 *msg, u16 attr_id, + const unsigned char *string) { u32 attr_data = 0, len = 0; u32 *attr; @@ -115,7 +115,7 @@ s32 fm10k_tlv_attr_put_null_string(u32 *msg, u16 attr_id, * it in the array pointed by by string. It will return success if provided * with a valid pointers. **/ -s32 fm10k_tlv_attr_get_null_string(u32 *attr, unsigned char *string) +static s32 fm10k_tlv_attr_get_null_string(u32 *attr, unsigned char *string) { u32 len; @@ -386,7 +386,7 @@ s32 fm10k_tlv_attr_get_le_struct(u32 *attr,
[dpdk-dev] [PATCH v2 02/16] fm10k/base: add macro definitions that are needed
Some macros such as FM10K_RXINT_TIMER_SHIFT are removed in the shared code drop, but they are needed in dpdk/fm10k. This patch puts all these necessary macros into fm10k_osdep.h. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/base/fm10k_osdep.h | 30 ++ 1 file changed, 30 insertions(+) diff --git a/drivers/net/fm10k/base/fm10k_osdep.h b/drivers/net/fm10k/base/fm10k_osdep.h index 6852ef0..869af1b 100644 --- a/drivers/net/fm10k/base/fm10k_osdep.h +++ b/drivers/net/fm10k/base/fm10k_osdep.h @@ -150,6 +150,36 @@ typedef int bool; #define fm10k_read_reg FM10K_READ_REG #endif +#define FM10K_INTEL_VENDOR_ID 0x8086 +#define FM10K_DMA_CTRL_MINMSS_SHIFT 9 +#define FM10K_EICR_PCA_FAULT 0x0001 +#define FM10K_EICR_THI_FAULT 0x0004 +#define FM10K_EICR_FUM_FAULT 0x0020 +#define FM10K_EICR_SRAMERROR 0x0400 +#define FM10K_SRAM_IP 0x13003 +#define FM10K_RXINT_TIMER_SHIFT 8 +#define FM10K_TXINT_TIMER_SHIFT 8 +#define FM10K_RXD_PKTTYPE_MASK 0x03F0 +#define FM10K_RXD_PKTTYPE_SHIFT 4 +enum fm10k_rdesc_pkt_type { + /* L3 type */ + FM10K_PKTTYPE_OTHER = 0x00, + FM10K_PKTTYPE_IPV4 = 0x01, + FM10K_PKTTYPE_IPV4_EX = 0x02, + FM10K_PKTTYPE_IPV6 = 0x03, + FM10K_PKTTYPE_IPV6_EX = 0x04, + + /* L4 type */ + FM10K_PKTTYPE_TCP = 0x08, + FM10K_PKTTYPE_UDP = 0x10, + FM10K_PKTTYPE_GRE = 0x18, + FM10K_PKTTYPE_VXLAN = 0x20, + FM10K_PKTTYPE_NVGRE = 0x28, + FM10K_PKTTYPE_GENEVE = 0x30 +}; +#define FM10K_RXD_STATUS_IPCS 0x0008 /* Indicates IPv4 csum */ +#define FM10K_RXD_STATUS_HBO 0x0400 /* header buffer overrun */ + #define FM10K_TSO_MINMSS \ (FM10K_DMA_CTRL_MINMSS_64 >> FM10K_DMA_CTRL_MINMSS_SHIFT) #define FM10K_TSO_MIN_HEADERLEN 54 -- 1.9.3
[dpdk-dev] [PATCH v2 01/16] fm10k: use default mailbox message handler for pf
The new shared code makes the fm10k_msg_update_pvid_pf function static, so we cannot refer to it now in fm10k_ethdev.c. The registered pf handler is almost the same as the default pf handler, so removing it has no impact on the mailbox. Signed-off-by: Wang Xiao W --- drivers/net/fm10k/fm10k_ethdev.c | 17 ++--- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/net/fm10k/fm10k_ethdev.c b/drivers/net/fm10k/fm10k_ethdev.c index e4aed94..2c38ce9 100644 --- a/drivers/net/fm10k/fm10k_ethdev.c +++ b/drivers/net/fm10k/fm10k_ethdev.c @@ -2367,29 +2367,16 @@ static const struct fm10k_msg_data fm10k_msgdata_vf[] = { FM10K_TLV_MSG_ERROR_HANDLER(fm10k_tlv_msg_error), }; -/* Mailbox message handler in PF */ -static const struct fm10k_msg_data fm10k_msgdata_pf[] = { - FM10K_PF_MSG_ERR_HANDLER(XCAST_MODES, fm10k_msg_err_pf), - FM10K_PF_MSG_ERR_HANDLER(UPDATE_MAC_FWD_RULE, fm10k_msg_err_pf), - FM10K_PF_MSG_LPORT_MAP_HANDLER(fm10k_msg_lport_map_pf), - FM10K_PF_MSG_ERR_HANDLER(LPORT_CREATE, fm10k_msg_err_pf), - FM10K_PF_MSG_ERR_HANDLER(LPORT_DELETE, fm10k_msg_err_pf), - FM10K_PF_MSG_UPDATE_PVID_HANDLER(fm10k_msg_update_pvid_pf), - FM10K_TLV_MSG_ERROR_HANDLER(fm10k_tlv_msg_error), -}; - static int fm10k_setup_mbx_service(struct fm10k_hw *hw) { - int err; + int err = 0; /* Initialize mailbox lock */ fm10k_mbx_initlock(hw); /* Replace default message handler with new ones */ - if (hw->mac.type == fm10k_mac_pf) - err = hw->mbx.ops.register_handlers(&hw->mbx, fm10k_msgdata_pf); - else + if (hw->mac.type == fm10k_mac_vf) err = hw->mbx.ops.register_handlers(&hw->mbx, fm10k_msgdata_vf); if (err) { -- 1.9.3
[dpdk-dev] [PATCH v2 00/16] fm10k: update shared code
v2: * Put the two extra fix patches ahead of the base code patches. Wang Xiao W (16): fm10k: use default mailbox message handler for pf fm10k/base: add macro definitions that are needed fm10k/base: cleanup namespace pollution and correct typecast fm10k/base: use bitshift for itr_scale fm10k/base: reset max_queues on init_hw_vf failure fm10k/base: document ITR scale workaround in VF TDLEN register fm10k/base: fix checkpatch warning fm10k/base: use BIT macro instead of open-coded bit-shifting of 1 fm10k/base: do not use CamelCase fm10k/base: use memcpy for mac addr copy fm10k/base: allow removal of is_slot_appropriate function fm10k/base: consistently use VLAN ID when referencing vid variables fm10k/base: fix comment per upstream review changes fm10k/base: TLV structures must be 4byte aligned, not 1byte aligned fm10k/base: move constants to the right of binary operators fm10k/base: minor cleanups drivers/net/fm10k/base/fm10k_api.c | 2 + drivers/net/fm10k/base/fm10k_api.h | 2 + drivers/net/fm10k/base/fm10k_mbx.c | 63 +++- drivers/net/fm10k/base/fm10k_mbx.h | 11 +-- drivers/net/fm10k/base/fm10k_osdep.h | 30 ++ drivers/net/fm10k/base/fm10k_pf.c| 88 + drivers/net/fm10k/base/fm10k_pf.h| 18 ++-- drivers/net/fm10k/base/fm10k_tlv.c | 40 drivers/net/fm10k/base/fm10k_tlv.h | 9 +- drivers/net/fm10k/base/fm10k_type.h | 182 +++ drivers/net/fm10k/base/fm10k_vf.c| 32 -- drivers/net/fm10k/fm10k_ethdev.c | 41 +++- 12 files changed, 220 insertions(+), 298 deletions(-) -- 1.9.3
[dpdk-dev] [PATCH v5 8/9] virtio: add 1.0 support
On Thu, Jan 21, 2016 at 12:37:42PM +0100, Thomas Monjalon wrote: > 2016-01-19 16:12, Yuanhan Liu: > > +#define IO_READ_DEF(nr_bits, type) \ > > +static inline type \ > > +io_read##nr_bits(type *addr) \ > > +{ \ > > + return *(volatile type *)addr; \ > > +} > > + > > +#define IO_WRITE_DEF(nr_bits, type)\ > > +static inline void \ > > +io_write##nr_bits(type val, type *addr)\ > > +{ \ > > + *(volatile type *)addr = val; \ > > +} > > + > > +IO_READ_DEF (8, uint8_t) > > +IO_WRITE_DEF(8, uint8_t) > > + > > +IO_READ_DEF (16, uint16_t) > > +IO_WRITE_DEF(16, uint16_t) > > + > > +IO_READ_DEF (32, uint32_t) > > +IO_WRITE_DEF(32, uint32_t) > > Yes you can do this. > But not sure you should. > > > +static inline void > > +io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi) > > +{ > > + io_write32(val & ((1ULL << 32) - 1), lo); > > + io_write32(val >> 32,hi); > > +} > > When debugging this code, how GDB behave? > How to find the definition of io_write32() with grep or simple editors? Okay, I will unfold them. --yliu
[dpdk-dev] [PATCH v2] ip_pipeline: fix cpu socket-id error
This patch fixes the socket-id error in ip_pipeline sample application running over uni-processor systems. Signed-off-by: Jasvinder Singh Acked-by: Cristian Dumitrescu --- v2: - used SOCKET_ID_ANY instead of -1 examples/ip_pipeline/init.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/ip_pipeline/init.c b/examples/ip_pipeline/init.c index 186ca03..c4601c9 100644 --- a/examples/ip_pipeline/init.c +++ b/examples/ip_pipeline/init.c @@ -835,6 +835,14 @@ app_init_link_frag_ras(struct app_params *app) } } +static inline int +app_get_cpu_socket_id(uint32_t pmd_id) +{ + int status = rte_eth_dev_socket_id(pmd_id); + + return (status != SOCKET_ID_ANY) ? status : 0; +} + static void app_init_link(struct app_params *app) { @@ -890,7 +898,7 @@ app_init_link(struct app_params *app) p_link->pmd_id, rxq_queue_id, p_rxq->size, - rte_eth_dev_socket_id(p_link->pmd_id), + app_get_cpu_socket_id(p_link->pmd_id), _rxq->conf, app->mempool[p_rxq->mempool_id]); if (status < 0) @@ -917,7 +925,7 @@ app_init_link(struct app_params *app) p_link->pmd_id, txq_queue_id, p_txq->size, - rte_eth_dev_socket_id(p_link->pmd_id), + app_get_cpu_socket_id(p_link->pmd_id), _txq->conf); if (status < 0) rte_panic("%s (%" PRIu32 "): " @@ -989,7 +997,7 @@ app_init_tm(struct app_params *app) /* TM */ p_tm->sched_port_params.name = p_tm->name; p_tm->sched_port_params.socket = - rte_eth_dev_socket_id(p_link->pmd_id); + app_get_cpu_socket_id(p_link->pmd_id); p_tm->sched_port_params.rate = (uint64_t) link_eth_params.link_speed * 1000 * 1000 / 8; -- 2.5.0
[dpdk-dev] [PATCH v5 8/9] virtio: add 1.0 support
On Thu, Jan 21, 2016 at 12:49:10PM +0100, Thomas Monjalon wrote: > 2016-01-19 16:12, Yuanhan Liu: > > int > > vtpci_init(struct rte_pci_device *dev, struct virtio_hw *hw) > > { > > - hw->vtpci_ops = _ops; > > + hw->dev = dev; > > + > > + /* > > +* Try if we can succeed reading virtio pci caps, which exists > > +* only on modern pci device. If failed, we fallback to legacy > > +* virtio handling. > > +*/ > > + if (virtio_read_caps(dev, hw) == 0) { > > + PMD_INIT_LOG(INFO, "modern virtio pci detected."); > > + hw->vtpci_ops = _ops; > > + hw->modern= 1; > > + dev->driver->drv_flags |= RTE_PCI_DRV_INTR_LSC; > > + return 0; > > + } > > RTE_PCI_DRV_INTR_LSC is already set by virtio_resource_init_by_uio(). We don't go that far here. Here we just detect whether it's a modern virtio device. If it is, we do the modern initialization and return. virtio_resource_init_by_uio() is invoked only when virtio_read_caps() fails. > Do you mean interrupt was not supported with legacy virtio? Nope, this patch set changes nothing in legacy virtio support. --yliu
[dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# Inter-VM communication & IP allocation through DHCP issue
Hi Abhijeet, It seems you are almost there! When booting the VM?s do you request hugepage memory for them (by setting hw:mem_page_size=large in flavor extra_spec)? If not then please do, if yes then please look into libvirt logfiles for the VM?s (in /var/log/libvirt/qemu/instance-xxx), I think there could be a clue. Regards Przemek From: Abhijeet Karve [mailto:abhijeet.ka...@tcs.com] Sent: Monday, January 25, 2016 6:13 PM To: Czesnowicz, Przemyslaw Cc: dev at dpdk.org; discuss at openvswitch.org; Gray, Mark D Subject: RE: [dpdk-dev] DPDK OVS on Ubuntu 14.04# Issue's Resolved# Inter-VM communication & IP allocation through DHCP issue Hi Przemek, Thank you for your response, It really provided us breakthrough. After setting up DPDK on compute node for stable/kilo, We are trying to set up Openstack stable/liberty all-in-one setup, At present we are not able to get the IP allocation for the vhost type instances through DHCP. Also we tried assigning IP's manually to them but the inter-VM communication also not happening, #neutron agent-list root at nfv-dpdk-devstack:/etc/neutron# neutron agent-list +--++---+---++---+ | id | agent_type | host | alive | admin_state_up | binary| +--++---+---++---+ | 3b29e93c-3a25-4f7d-bf6c-6bb309db5ec0 | DPDK OVS Agent | nfv-dpdk-devstack | :-) | True | neutron-openvswitch-agent | | 62593b2c-c10f-4d93-8551-c46ce24895a6 | L3 agent | nfv-dpdk-devstack | :-) | True | neutron-l3-agent | | 7cb97af9-cc20-41f8-90fb-aba97d39dfbd | DHCP agent | nfv-dpdk-devstack | :-) | True | neutron-dhcp-agent| | b613c654-99b7-437e-9317-20fa651a1310 | Linux bridge agent | nfv-dpdk-devstack | :-) | True | neutron-linuxbridge-agent | | c2dd0384-6517-4b44-9c25-0d2825d23f57 | Metadata agent | nfv-dpdk-devstack | :-) | True | neutron-metadata-agent| | f23dde40-7dc0-4f20-8b3e-eb90ddb15e49 | Open vSwitch agent | nfv-dpdk-devstack | xxx | True | neutron-openvswitch-agent | +--++---+---++---+ ovs-vsctl show output# Bridge br-dpdk Port br-dpdk Interface br-dpdk type: 
internal Port phy-br-dpdk Interface phy-br-dpdk type: patch options: {peer=int-br-dpdk} Bridge br-int fail_mode: secure Port "vhufa41e799-f2" tag: 5 Interface "vhufa41e799-f2" type: dpdkvhostuser Port int-br-dpdk Interface int-br-dpdk type: patch options: {peer=phy-br-dpdk} Port "tap4e19f8e1-59" tag: 5 Interface "tap4e19f8e1-59" type: internal Port "vhu05734c49-3b" tag: 5 Interface "vhu05734c49-3b" type: dpdkvhostuser Port "vhu10c06b4d-84" tag: 5 Interface "vhu10c06b4d-84" type: dpdkvhostuser Port patch-tun Interface patch-tun type: patch options: {peer=patch-int} Port "vhue169c581-ef" tag: 5 Interface "vhue169c581-ef" type: dpdkvhostuser Port br-int Interface br-int type: internal Bridge br-tun fail_mode: secure Port br-tun Interface br-tun type: internal error: "could not open network device br-tun (Invalid argument)" Port patch-int Interface patch-int type: patch options: {peer=patch-tun} ovs_version: "2.4.0" ovs-ofctl dump-flows br-int# root at nfv-dpdk-devstack:/etc/neutron# ovs-ofctl dump-flows br-int NXST_FLOW reply (xid=0x4): cookie=0xaaa002bb2bcf827b, duration=2410.012s, table=0, n_packets=0, n_bytes=0, idle_age=2410, priority=10,icmp6,in_port=43,icmp_type=136 actions=resubmit(,24) cookie=0xaaa002bb2bcf827b, duration=2409.480s, table=0, n_packets=0, n_bytes=0, idle_age=2409, priority=10,icmp6,in_port=44,icmp_type=136 actions=resubmit(,24) cookie=0xaaa002bb2bcf827b, duration=2408.704s, table=0, n_packets=0, n_bytes=0, idle_age=2408, priority=10,icmp6,in_port=45,icmp_type=136 actions=resubmit(,24) cookie=0xaaa002bb2bcf827b, duration=2408.155s, table=0, n_packets=0, n_bytes=0, idle_age=2408,
[dpdk-dev] [PATCH 2/5] vhost: refactor virtio_dev_rx
On Thu, Jan 21, 2016 at 02:50:01PM +0100, Jérôme Jutteau wrote: > Hi Yuanhan, > > 2015-12-14 2:47 GMT+01:00 Yuanhan Liu : > > Right, I should move it in the beginning of this function. > > Any news about this refactoring ? Hi Jérôme, Thanks for showing interest in this patch set; I was waiting for Huawei's comments. And fortunately, he has started making comments. --yliu
[dpdk-dev] [PATCH 1/5] vhost: refactor rte_vhost_dequeue_burst
On Tue, Jan 26, 2016 at 10:30:12AM +0000, Xie, Huawei wrote: > On 12/3/2015 2:03 PM, Yuanhan Liu wrote: > > Signed-off-by: Yuanhan Liu > > --- > > lib/librte_vhost/vhost_rxtx.c | 287 > > +- > > 1 file changed, 113 insertions(+), 174 deletions(-) > > Prefer to unroll copy_mbuf_to_desc and your COPY macro. It prevents us I'm okay with unrolling the COPY macro. But for copy_mbuf_to_desc, I prefer not to do that unless there is a good reason. > processing descriptors in a burst way in future. So, do you have a plan? --yliu
[dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
On Wed, Jan 27, 2016 at 11:12 AM, Thomas Monjalon wrote: > 2016-01-27 10:08, Burakov, Anatoly: >> > Why a new file for these functions? >> >> Well, my thought was to make future extensions easier by way of avoiding >> mixing irrelevant and/or general code with driver-specific code. I can >> change it back if that's not OK. > > No strong opinion here. > David? Hum, no strong opinion either, but I don't think we really need to split this file for this much code. Besides, if we keep all code in eal_pci_vfio.c, there is no need to expose those structures through eal_pci_init.h. -- David Marchand
[dpdk-dev] [PATCH 4/5] vhost: do not use rte_memcpy for virtio_hdr copy
On Wed, Jan 27, 2016 at 02:46:39AM +0000, Xie, Huawei wrote: > On 12/3/2015 2:03 PM, Yuanhan Liu wrote: > > + if (vq->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) { > > + *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr; > > + } else { > > + *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr; > > + } > > Thanks! > We might simplify this further. Just reset the first two fields flags > and gso_type. What's this "simplification" for? Not to mention that we will add TSO support, which modifies a few more fields, such as csum_start: resetting only the first two fields is wrong here. --yliu
[dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
2016-01-27 10:08, Burakov, Anatoly: > > Why a new file for these functions? > > Well, my thought was to make future extensions easier by way of avoiding > mixing irrelevant and/or general code with driver-specific code. I can change > it back if that's not OK. No strong opinion here. David?
[dpdk-dev] bnx2x driver and 57800 versus 57810
On Wed, 2016-01-27 at 07:32 +0000, Harish Patil wrote: > > > >I have two practically identical systems, same hypervisor on each > (Centos > >7.x). In one, I have a 57800 card which works fine with DPDK with > >SRIOV. In the other, I have a 57810 card which doesn't work with > SRIOV. > > > >For the 57810 I have tracked this down to the status block in the VF > >failing to be updated. The linux driver works fine but it appears to > >use a slightly different scheme -- writing some sort of fastpath > status > >block generation per interrupt. > > > >Does anyone have any suggestions or a programming guide for this > device? > > > > > > What is not working with 57810? Is it link related or traffic? Please > provide the details. > Attached is the SW programming guide for 577xx/578xx. I'm not sure if > it has details pertaining to the specific issue that you have. The DPDK PMD driver seems to be able to transmit packets on the 57810. But since the status block isn't getting updated, you can't reclaim the sent buffers. I modified the driver to use the marker based receive detection (similar to the method used in the Linux driver) and I can see packets getting received (certainly broadcast is received -- possibly not unicast packets though which seems to indicate that part of the RX path is possibly still broken). I have tried a couple things. The status page in the DPDK PMD driver isn't getting page aligned (as well as a bunch of other structures that should probably be page aligned). The Linux driver happens to do this as a side effect of the DMA allocator. Fixing this didn't seem to improve matters though. The status block doesn't seem to get updated. I verified that the correct DMA address is getting passed to the PF. And since it works on the 57800, I thought perhaps something changed. 
Also, the DPDK driver probably gets the RX/TX queue indices wrong during initial setup. The final values coming out of the allocation loop are probably bigger than they should be. Should they point to the end of the queue or just past the end? Also, the tail of the queue needs to be corrected for the double entry at the end of the pages. Again, fixing this didn't seem to help either. The VF-PF interaction seems to be ok as well. Other than not supporting SGE, the DPDK PMD driver seems to send reasonably correct messages to the PF. I don't see the DPDK PMD driver doing anything to 'reset' the PCI aspect of the VF. If there is any left over configuration for interrupts, like leaving the IGU enabled, that may not be cleared, I am not sure what the interaction might be. I do know the Linux driver does seem to use MSI-X interrupts. > Thanks, > Harish Thanks for looking at this and thanks for the programming guide. It will take me a bit to digest it.
[dpdk-dev] [PATCH v2 1/2] ethdev: remove useless null checks
On Tue, Jan 26, 2016 at 4:50 PM, Jan Viktorin wrote: > What about the RTE_VERIFY? I think, it's more appropriate here. Well, here, I am removing useless checks in static functions. But for the rest of ethdev api, I agree we could add some RTE_VERIFY. > Otherwise, feel free to add: > > Reviewed-by: Jan Viktorin Thanks. -- David Marchand
[dpdk-dev] [PATCH] eal: add function to check if primary proc alive
> From: Richardson, Bruce > > Agreed, however hiding it totally removes the flexibility of waiting for a > > primary > > that is starting with --file-prefix (aka: in a non-default location). > > Imposing > > a limit on only monitoring primary procs in the default location seems > > wrong. > > But the secondary also needs the same prefix. Is that prefix not accessible by > this function to be used? The issue is that the EAL parsing code is performed during rte_init(), which is exactly what this function tries to avoid - initializing EAL before a primary process starts. I looked at changing the EAL parsing to come before rte_init(), and considered adding a minimal parser for --file-prefix. Both routes seem a bad solution, either for complexity or code-duplication. v2 of this patch posted to list: http://dpdk.org/dev/patchwork/patch/10126/ -Harry
[dpdk-dev] [PATCH] log: add missing symbol
2015-12-16 16:38, Stephen Hemminger: > rte_get_log_type and rte_get_log_level functions have been available > for many versions. But they are missing from the shared library map > and therefore do not get exported correctly. > > Signed-off-by: Stephen Hemminger > --- > lib/librte_eal/linuxapp/eal/rte_eal_version.map | 2 ++ > 1 file changed, 2 insertions(+) Why only in linuxapp? > diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map > b/lib/librte_eal/linuxapp/eal/rte_eal_version.map > index cbe175f..51a241c 100644 > --- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map > +++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map > @@ -93,7 +93,9 @@ DPDK_2.0 { > rte_realloc; > rte_set_application_usage_hook; > rte_set_log_level; > + rte_get_log_level; > rte_set_log_type; > + rte_get_log_type; We try to keep an alphabetical order :)
[dpdk-dev] [PATCH v5 01/11] virtio: Introduce config RTE_VIRTIO_INC_VECTOR
On Wed, Jan 27, 2016 at 07:53:21AM +0530, Santosh Shukla wrote: > Ping? I was on vacation late last week, and I have been quite busy since getting back. So, sorry that I still won't have time for a more detailed review in the next 1 or 2 days. Hopefully I can make it by this Friday. BTW, I had a very quick glimpse of this patchset; overall, it looks much better now, except the EAL changes (I'm not the maintainer) and the virtio io port read/write stuff: Tetsuya suggested adding another access wrapper, but I have a few concerns about that. Anyway, I don't have time for deeper thoughts, and I will re-think it later. --yliu
[dpdk-dev] [PATCH v2] eal: add function to check if primary proc alive
This patch adds a new function to the EAL API: int rte_eal_primary_proc_alive(const char *path); The function indicates if a primary process is alive right now. This functionality is implemented by testing for a write- lock on the config file, and the function tests for a lock. The use case for this functionality is that a secondary process can wait until a primary process starts by polling the function and waiting. When the primary is running, the secondary continues to poll to detect if the primary process has quit unexpectedly, the secondary process can detect this. The RTE_MAGIC number is written to the shared config by the primary process, this is the signal to the secondary process that the EAL is set up, and ready to be used. The function rte_eal_mcfg_complete() writes RTE_MAGIC. This has been delayed in the EAL init proceedure, as the PCI probing in the primary process can interfere with the secondary running. Signed-off-by: Harry van Haaren --- v2: - Passing NULL as const char* uses default /var/run/.rte_config - Moved code into /common/ instead of /linuxapp/, should work on BSD now doc/guides/rel_notes/release_2_3.rst| 7 +++ lib/librte_eal/bsdapp/eal/Makefile | 1 + lib/librte_eal/bsdapp/eal/rte_eal_version.map | 8 lib/librte_eal/common/eal_common_proc.c | 61 + lib/librte_eal/common/include/rte_eal.h | 18 lib/librte_eal/linuxapp/eal/Makefile| 1 + lib/librte_eal/linuxapp/eal/eal.c | 4 +- lib/librte_eal/linuxapp/eal/rte_eal_version.map | 7 +++ 8 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 lib/librte_eal/common/eal_common_proc.c diff --git a/doc/guides/rel_notes/release_2_3.rst b/doc/guides/rel_notes/release_2_3.rst index 99de186..14b5b06 100644 --- a/doc/guides/rel_notes/release_2_3.rst +++ b/doc/guides/rel_notes/release_2_3.rst @@ -11,6 +11,13 @@ Resolved Issues EAL ~~~ +* **Added rte_eal_primary_proc_alive() function** + + A new function ``rte_eal_primary_proc_alive()`` has been added + to allow the user to detect if a primary 
process is running. + Use cases for this feature include fault detection, and monitoring + using secondary processes. + Drivers ~~~ diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile index 65b293f..2d6e3b1 100644 --- a/lib/librte_eal/bsdapp/eal/Makefile +++ b/lib/librte_eal/bsdapp/eal/Makefile @@ -61,6 +61,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_alarm.c # from common dir SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_lcore.c +SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_proc.c SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_timer.c SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_memzone.c SRCS-$(CONFIG_RTE_LIBRTE_EAL_BSDAPP) += eal_common_log.c diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map b/lib/librte_eal/bsdapp/eal/rte_eal_version.map index 9d7adf1..0e28017 100644 --- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map +++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map @@ -135,3 +135,11 @@ DPDK_2.2 { rte_xen_dom0_supported; } DPDK_2.1; + + +DPDK_2.3 { + global: + + rte_eal_primary_proc_alive; + +} DPDK_2.2; diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c new file mode 100644 index 000..c598891 --- /dev/null +++ b/lib/librte_eal/common/eal_common_proc.c @@ -0,0 +1,61 @@ +/*- + * BSD LICENSE + * + * Copyright 2016 Intel Shannon Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
[dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
> >> > Why a new file for these functions? > >> > >> Well, my thought was to make future extensions easier by way of > avoiding mixing irrelevant and/or general code with driver-specific code. I > can > change it back if that's not OK. > > > > No strong opinion here. > > David? > > Hum, no strong opinion either, but I don't think we really need to split this > file for this much code. > Besides, if we keep all code in eal_pci_vfio.c, there is no need to expose > those structures through eal_pci_init.h. OK then, I'll merge it back into the eal_pci_vfio.c Thanks, Anatoly
[dpdk-dev] [PATCH] ethdev: fix byte order inconsistence between fdir flow and mask
2016-01-27 16:37, Jingjing Wu: > Fixed issue of byte order in ethdev library that the structure > for setting fdir's mask and flow entry is inconsist and made > inputs of mask be in big endian. Please be more precise. Which one is big endian? Wasn't it tested before? > fixes: 76c6f89e80d4 ("ixgbe: support new flow director masks") >2d4c1a9ea2ac ("ethdev: add new flow director masks") Please put Fixes: on the two lines. > --- a/doc/guides/rel_notes/release_2_3.rst > +++ b/doc/guides/rel_notes/release_2_3.rst > @@ -19,6 +19,10 @@ Drivers > Libraries > ~ > > +* ** fix byte order inconsistence between fdir flow and mask ** > + > + Fixed issue in ethdev library that the structure for setting > + fdir's mask and flow entry is inconsist in byte order. John, comment on release notes formatting? It's important to have the first items well formatted. > @@ -39,6 +43,8 @@ API Changes > ABI Changes > --- > > +* The fields in The ethdev structures ``rte_eth_fdir_masks`` were > + changed to be in big endian. Please take care of uppercase typo here. > - /* write all the same so that UDP, TCP and SCTP use the same mask */ > + /* write all the same so that UDP, TCP and SCTP use the same mask > + * (little-endian) > + */ Spacing typo here. Sorry for the nits ;) > - uint8_t mac_addr_byte_mask; /** Per byte MAC address mask */ > + uint8_t mac_addr_byte_mask; /** Bit mask for associated byte */ > uint32_t tunnel_id_mask; /** tunnel ID mask */ > - uint8_t tunnel_type_mask; > + uint8_t tunnel_type_mask; /**< 1 - Match tunnel type, > +0 - Ignore tunnel type. */ These changes seem unrelated with the patch. It's good to improve doc of this API but it's maybe not enough. Example: uint8_t mac_addr_byte_mask; /** Bit mask for associated byte */ Are we sure everybody understand how to fill it?
[dpdk-dev] [PATCH] ip_pipeline: add load balancing function to pass-through pipeline
The passthrough pipeline implementation is extended with load balancing function. This function allows uniform distribution of the packets among its output ports. For packets distribution, any application level logic can be applied. For instance, in this implementation, hash value computed over specific header fields of the incoming packets has been used to spread traffic uniformly among the output ports. The following passthrough configuration can be used for implementing load balancing function over ipv4 traffic; [PIPELINE0] type = PASS-THROUGH core = 0 pktq_in = RXQ0.0 RXQ1.0 RXQ2.0 RXQ3.0 pktq_out = TXQ0.0 TXQ1.0 TXQ2.0 TXQ3.0 dma_src_offset = 278; mbuf (128) + headroom (128) + 1st ethertype offset (14) + ttl offset within ip header = 278 (ipv4) dma_dst_offset = 128; mbuf (128) dma_size = 16 dma_src_mask = 00FF dma_hash_offset = 144; (dma_dst_offset+dma_size) lb = hash Signed-off-by: Jasvinder Singh Acked-by: Cristian Dumitrescu --- .../ip_pipeline/pipeline/pipeline_actions_common.h | 22 ++ .../ip_pipeline/pipeline/pipeline_passthrough_be.c | 281 - .../ip_pipeline/pipeline/pipeline_passthrough_be.h | 2 + 3 files changed, 245 insertions(+), 60 deletions(-) diff --git a/examples/ip_pipeline/pipeline/pipeline_actions_common.h b/examples/ip_pipeline/pipeline/pipeline_actions_common.h index 9958758..2c08db2 100644 --- a/examples/ip_pipeline/pipeline/pipeline_actions_common.h +++ b/examples/ip_pipeline/pipeline/pipeline_actions_common.h @@ -59,6 +59,28 @@ f_ah( \ return 0; \ } +#define PIPELINE_PORT_IN_AH_LB(f_ah, f_pkt_work, f_pkt4_work) \ +static int \ +f_ah( \ + struct rte_pipeline *p, \ + struct rte_mbuf **pkts, \ + uint32_t n_pkts,\ + void *arg) \ +{ \ + uint32_t i; \ + \ + uint64_t pkt_mask = RTE_LEN2MASK(n_pkts, uint64_t); \ + \ + rte_pipeline_ah_packet_hijack(p, pkt_mask); \ + for (i = 0; i < (n_pkts & (~0x3LLU)); i += 4) \ + f_pkt4_work([i], arg); \ + \ + for ( ; i < n_pkts; i++)\ + f_pkt_work(pkts[i], arg); \ + \ + return 0; \ +} + #define 
PIPELINE_TABLE_AH_HIT(f_ah, f_pkt_work, f_pkt4_work) \ static int \ f_ah( \ diff --git a/examples/ip_pipeline/pipeline/pipeline_passthrough_be.c b/examples/ip_pipeline/pipeline/pipeline_passthrough_be.c index 7642462..75b6fd8 100644 --- a/examples/ip_pipeline/pipeline/pipeline_passthrough_be.c +++ b/examples/ip_pipeline/pipeline/pipeline_passthrough_be.c @@ -72,7 +72,9 @@ pkt_work( struct rte_mbuf *pkt, void *arg, uint32_t dma_size, - uint32_t hash_enabled) + uint32_t hash_enabled, + uint32_t lb_hash, + uint32_t port_out_pw2) { struct pipeline_passthrough *p = arg; @@ -90,8 +92,24 @@ pkt_work( dma_dst[i] = dma_src[i] & dma_mask[i]; /* Read (dma_dst), compute (hash), write (hash) */ - if (hash_enabled) - *dma_hash = p->f_hash(dma_dst, dma_size, 0); + if (hash_enabled) { + uint32_t hash = p->f_hash(dma_dst, dma_size, 0); + *dma_hash = hash; + + if (lb_hash) { + uint32_t port_out; + + if (port_out_pw2) + port_out + = hash & (p->p.n_ports_out - 1); + else + port_out + = hash % p->p.n_ports_out; + + rte_pipeline_port_out_packet_insert(p->p.p, + port_out, pkt); + } + } } static inline __attribute__((always_inline)) void @@ -99,7 +117,9 @@ pkt4_work( struct rte_mbuf **pkts, void *arg, uint32_t dma_size, - uint32_t hash_enabled) + uint32_t hash_enabled, + uint32_t lb_hash, +
[dpdk-dev] [PATCH v2] vfio: Support for no-IOMMU mode
Hi Anatoly, Few small comments. The comments "function pointer typedef" or "structure to hold" don't bring new information. Please keep it short. 2016-01-13 12:36, Anatoly Burakov: > +/* function pointer typedef for DMA mapping functions */ -> DMA mapping function type It would be relevant to describe the return and the parameter. > +typedef int (*vfio_dma_func_t)(int); > + > +/* Structure to hold supported IOMMU types */ This comment seems useless. > +struct vfio_iommu_type { [...] > +/* function prototypes for different IOMMU types */ idem > +int vfio_iommu_type1_dma_map(int container_fd); > +int vfio_iommu_noiommu_dma_map(int container_fd); > + > +/* IOMMU types we support */ > +static const struct vfio_iommu_type iommu_types[] = { > + /* x86 IOMMU, otherwise known as type 1 */ > + { VFIO_TYPE1_IOMMU, "Type 1", _iommu_type1_dma_map}, > + /* IOMMU-less mode */ > + { VFIO_NOIOMMU_IOMMU, "No-IOMMU", _iommu_noiommu_dma_map}, > +}; [...] > --- /dev/null > +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_dma.c Why a new file for these functions?
[dpdk-dev] [RFC PATCH 5/5] virtio: Extend virtio-net PMD to support container environment
On 1/21/2016 7:09 PM, Tetsuya Mukawa wrote: > + /* Set BAR region */ > + for (i = 0; i < NB_BAR; i++) { > + switch (dev->bar[i].type) { > + case QTEST_PCI_BAR_IO: > + case QTEST_PCI_BAR_MEMORY_UNDER_1MB: > + case QTEST_PCI_BAR_MEMORY_32: > + qtest_pci_outl(s, bus, device, 0, dev->bar[i].addr, > + dev->bar[i].region_start); > + PMD_DRV_LOG(INFO, "Set BAR of %s device: 0x%lx - > 0x%lx\n", > + dev->name, dev->bar[i].region_start, > + dev->bar[i].region_start + > dev->bar[i].region_size); > + break; > + case QTEST_PCI_BAR_MEMORY_64: > + qtest_pci_outq(s, bus, device, 0, dev->bar[i].addr, > + dev->bar[i].region_start); > + PMD_DRV_LOG(INFO, "Set BAR of %s device: 0x%lx - > 0x%lx\n", > + dev->name, dev->bar[i].region_start, > + dev->bar[i].region_start + > dev->bar[i].region_size); > + break; Hasn't the bar resource already been allocated? Is it the app's responsibility to allocate the bar resource in qtest mode? The app couldn't have that knowledge. > + case QTEST_PCI_BAR_DISABLE: > + break; > + } > + } > +
[dpdk-dev] [PATCH v2 2/2] i40evf: support interrupt based pf reset request
Interrupt based request of PF reset from PF is supported by enabling the adminq event process in VF driver. Users can register a callback for this interrupt event to get informed, when a PF reset request detected like: rte_eth_dev_callback_register(portid, RTE_ETH_EVENT_INTR_RESET, reset_event_callback, arg); Signed-off-by: Jingjing Wu --- doc/guides/rel_notes/release_2_3.rst | 1 + drivers/net/i40e/i40e_ethdev_vf.c| 274 +++ lib/librte_ether/rte_ethdev.h| 1 + 3 files changed, 246 insertions(+), 30 deletions(-) diff --git a/doc/guides/rel_notes/release_2_3.rst b/doc/guides/rel_notes/release_2_3.rst index 99de186..73d5f76 100644 --- a/doc/guides/rel_notes/release_2_3.rst +++ b/doc/guides/rel_notes/release_2_3.rst @@ -4,6 +4,7 @@ DPDK Release 2.3 New Features +* **Added pf reset event reported in i40e vf PMD driver. Resolved Issues --- diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 64e6957..1ffe64e 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -74,8 +74,6 @@ #define I40EVF_BUSY_WAIT_DELAY 10 #define I40EVF_BUSY_WAIT_COUNT 50 #define MAX_RESET_WAIT_CNT 20 -/*ITR index for NOITR*/ -#define I40E_QINT_RQCTL_MSIX_INDX_NOITR 3 struct i40evf_arq_msg_info { enum i40e_virtchnl_ops ops; @@ -151,6 +149,9 @@ static int i40evf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id); static int i40evf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id); +static void i40evf_handle_pf_event(__rte_unused struct rte_eth_dev *dev, + uint8_t *msg, + uint16_t msglen); /* Default hash key buffer for RSS */ static uint32_t rss_key_default[I40E_VFQF_HKEY_MAX_INDEX + 1]; @@ -357,20 +358,42 @@ i40evf_execute_vf_cmd(struct rte_eth_dev *dev, struct vf_cmd_info *args) return err; } - do { - /* Delay some time first */ - rte_delay_ms(ASQ_DELAY_MS); - ret = i40evf_read_pfmsg(dev, ); - if (ret == I40EVF_MSG_CMD) { - err = 0; - break; - } else if (ret == I40EVF_MSG_ERR) { - err = -1; - 
break; - } - /* If don't read msg or read sys event, continue */ - } while (i++ < MAX_TRY_TIMES); - _clear_cmd(vf); + switch (args->ops) { + case I40E_VIRTCHNL_OP_RESET_VF: + /*no need to process in this function */ + break; + case I40E_VIRTCHNL_OP_VERSION: + case I40E_VIRTCHNL_OP_GET_VF_RESOURCES: + /* for init adminq commands, need to poll the response */ + do { + /* Delay some time first */ + rte_delay_ms(ASQ_DELAY_MS); + ret = i40evf_read_pfmsg(dev, ); + if (ret == I40EVF_MSG_CMD) { + err = 0; + break; + } else if (ret == I40EVF_MSG_ERR) { + err = -1; + break; + } + /* If don't read msg or read sys event, continue */ + } while (i++ < MAX_TRY_TIMES); + _clear_cmd(vf); + break; + + default: + /* for other adminq in running time, waiting the cmd done flag */ + do { + /* Delay some time first */ + rte_delay_ms(ASQ_DELAY_MS); + if (vf->pend_cmd == I40E_VIRTCHNL_OP_UNKNOWN) { + err = 0; + break; + } + /* If don't read msg or read sys event, continue */ + } while (i++ < MAX_TRY_TIMES); + break; + } return (err | vf->cmd_retval); } @@ -719,7 +742,7 @@ i40evf_config_irq_map(struct rte_eth_dev *dev) map_info = (struct i40e_virtchnl_irq_map_info *)cmd_buffer; map_info->num_vectors = 1; - map_info->vecmap[0].rxitr_idx = I40E_QINT_RQCTL_MSIX_INDX_NOITR; + map_info->vecmap[0].rxitr_idx = I40E_ITR_INDEX_DEFAULT; map_info->vecmap[0].vsi_id = vf->vsi_res->vsi_id; /* Alway use default dynamic MSIX interrupt */ map_info->vecmap[0].vector_id = vector_id; @@ -1093,6 +1116,38 @@ i40evf_dev_atomic_write_link_status(struct rte_eth_dev *dev, return 0; } +/* Disable IRQ0 */ +static inline void +i40evf_disable_irq0(struct i40e_hw *hw) +{ + /* Disable all interrupt types */ + I40E_WRITE_REG(hw, I40E_VFINT_ICR0_ENA1, 0); + I40E_WRITE_REG(hw, I40E_VFINT_DYN_CTL01, +
[dpdk-dev] [PATCH v2 1/2] i40evf: allocate virtchnl cmd buffer for each vf
Currently, i40evf PMD uses a global static buffer to send virtchnl command to host driver. It is shared by multi VFs. This patch changed to allocate virtchnl cmd buffer for each VF. Signed-off-by: Jingjing Wu --- drivers/net/i40e/i40e_ethdev.h| 2 + drivers/net/i40e/i40e_ethdev_vf.c | 181 +++--- 2 files changed, 74 insertions(+), 109 deletions(-) diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h index 1f9792b..93122ad 100644 --- a/drivers/net/i40e/i40e_ethdev.h +++ b/drivers/net/i40e/i40e_ethdev.h @@ -494,7 +494,9 @@ struct i40e_vf { bool link_up; bool vf_reset; volatile uint32_t pend_cmd; /* pending command not finished yet */ + uint32_t cmd_retval; /* return value of the cmd response from PF */ u16 pend_msg; /* flags indicates events from pf not handled yet */ + uint8_t *aq_resp; /* buffer to store the adminq response from PF */ /* VSI info */ struct i40e_virtchnl_vf_resource *vf_res; /* All VSIs */ diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 14d2a50..64e6957 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -103,9 +103,6 @@ enum i40evf_aq_result { I40EVF_MSG_CMD, /* Read async command result */ }; -/* A share buffer to store the command result from PF driver */ -static uint8_t cmd_result_buffer[I40E_AQ_BUF_SZ]; - static int i40evf_dev_configure(struct rte_eth_dev *dev); static int i40evf_dev_start(struct rte_eth_dev *dev); static void i40evf_dev_stop(struct rte_eth_dev *dev); @@ -237,31 +234,39 @@ i40evf_set_mac_type(struct i40e_hw *hw) } /* - * Parse admin queue message. 
- * - * return value: - * < 0: meet error - * 0: read sys msg - * > 0: read cmd result + * Read data in admin queue to get msg from pf driver */ static enum i40evf_aq_result -i40evf_parse_pfmsg(struct i40e_vf *vf, - struct i40e_arq_event_info *event, - struct i40evf_arq_msg_info *data) +i40evf_read_pfmsg(struct rte_eth_dev *dev, struct i40evf_arq_msg_info *data) { - enum i40e_virtchnl_ops opcode = (enum i40e_virtchnl_ops)\ - rte_le_to_cpu_32(event->desc.cookie_high); - enum i40e_status_code retval = (enum i40e_status_code)\ - rte_le_to_cpu_32(event->desc.cookie_low); - enum i40evf_aq_result ret = I40EVF_MSG_CMD; + struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); + struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); + struct i40e_arq_event_info event; + enum i40e_virtchnl_ops opcode; + enum i40e_status_code retval; + int ret; + enum i40evf_aq_result result = I40EVF_MSG_NON; + event.buf_len = data->buf_len; + event.msg_buf = data->msg; + ret = i40e_clean_arq_element(hw, &event, NULL); + /* Can't read any msg from adminQ */ + if (ret) { + if (ret == I40E_ERR_ADMIN_QUEUE_NO_WORK) + result = I40EVF_MSG_NON; + else + result = I40EVF_MSG_ERR; + return result; + } + + opcode = (enum i40e_virtchnl_ops)rte_le_to_cpu_32(event.desc.cookie_high); + retval = (enum i40e_status_code)rte_le_to_cpu_32(event.desc.cookie_low); /* pf sys event */ if (opcode == I40E_VIRTCHNL_OP_EVENT) { struct i40e_virtchnl_pf_event *vpe = - (struct i40e_virtchnl_pf_event *)event->msg_buf; + (struct i40e_virtchnl_pf_event *)event.msg_buf; - /* Initialize ret to sys event */ - ret = I40EVF_MSG_SYS; + result = I40EVF_MSG_SYS; switch (vpe->event) { case I40E_VIRTCHNL_EVENT_LINK_CHANGE: vf->link_up = @@ -286,74 +291,17 @@ i40evf_parse_pfmsg(struct i40e_vf *vf, } } else { /* async reply msg on command issued by vf previously */ - ret = I40EVF_MSG_CMD; + result = I40EVF_MSG_CMD; /* Actual data length read from PF */ - data->msg_len = event->msg_len; + data->msg_len = 
event.msg_len; } - /* fill the ops and result to notify VF */ + data->result = retval; data->ops = opcode; - return ret; -} - -/* - * Read data in admin queue to get msg from pf driver - */ -static enum i40evf_aq_result -i40evf_read_pfmsg(struct rte_eth_dev *dev, struct i40evf_arq_msg_info *data) -{ - struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); - struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private); - struct i40e_arq_event_info event; - int ret; - enum i40evf_aq_result result = I40EVF_MSG_NON; - - event.buf_len = data->buf_len; - event.msg_buf = data->msg; - ret =