[dpdk-dev] [PATCH 1/3] librte_reorder: New reorder library

2015-01-08 Thread Richard Sanger
On 8 January 2015 at 05:39, Reshma Pattan  wrote:
> From: Reshma Pattan 
>
> 1) New library to provide reordering of out-of-order
> mbufs based on the sequence number of the mbuf. The library uses a reorder 
> buffer structure
> which in turn uses two circular buffers called ready and order 
> buffers.
> *rte_reorder_create API creates instance of reorder buffer.
> *rte_reorder_init API initializes given reorder buffer instance.
> *rte_reorder_reset API resets given reorder buffer instance.
> *rte_reorder_insert API inserts the mbuf into order circular 
> buffer.
> *rte_reorder_fill_overflow moves mbufs from order buffer to ready 
> buffer
> to accommodate early packets in order buffer.
> *rte_reorder_drain API provides draining facility to fetch out
> reordered mbufs from order and ready buffers.
>
> Signed-off-by: Reshma Pattan 
> Signed-off-by: Richardson Bruce 
> ---
>  config/common_bsdapp   |   5 +
>  config/common_linuxapp |   5 +
>  lib/Makefile   |   1 +
>  lib/librte_eal/common/include/rte_tailq_elem.h |   2 +
>  lib/librte_mbuf/rte_mbuf.h |   3 +
>  lib/librte_reorder/Makefile|  50 +++
>  lib/librte_reorder/rte_reorder.c   | 464 
> +
>  lib/librte_reorder/rte_reorder.h   | 184 ++
>  8 files changed, 714 insertions(+)
>  create mode 100644 lib/librte_reorder/Makefile
>  create mode 100644 lib/librte_reorder/rte_reorder.c
>  create mode 100644 lib/librte_reorder/rte_reorder.h
>
> diff --git a/config/common_bsdapp b/config/common_bsdapp
> index 9177db1..e3e0e94 100644
> --- a/config/common_bsdapp
> +++ b/config/common_bsdapp
> @@ -334,6 +334,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
>  CONFIG_RTE_LIBRTE_DISTRIBUTOR=y
>
>  #
> +# Compile the reorder library
> +#
> +CONFIG_RTE_LIBRTE_REORDER=y
> +
> +#
>  # Compile librte_port
>  #
>  CONFIG_RTE_LIBRTE_PORT=y
> diff --git a/config/common_linuxapp b/config/common_linuxapp
> index 2f9643b..b5ec730 100644
> --- a/config/common_linuxapp
> +++ b/config/common_linuxapp
> @@ -342,6 +342,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
>  CONFIG_RTE_LIBRTE_DISTRIBUTOR=y
>
>  #
> +# Compile the reorder library
> +#
> +CONFIG_RTE_LIBRTE_REORDER=y
> +
> +#
>  # Compile librte_port
>  #
>  CONFIG_RTE_LIBRTE_PORT=y
> diff --git a/lib/Makefile b/lib/Makefile
> index 0ffc982..5919d32 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -65,6 +65,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += librte_distributor
>  DIRS-$(CONFIG_RTE_LIBRTE_PORT) += librte_port
>  DIRS-$(CONFIG_RTE_LIBRTE_TABLE) += librte_table
>  DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += librte_pipeline
> +DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder
>
>  ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
>  DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
> diff --git a/lib/librte_eal/common/include/rte_tailq_elem.h 
> b/lib/librte_eal/common/include/rte_tailq_elem.h
> index f74fc7c..3013869 100644
> --- a/lib/librte_eal/common/include/rte_tailq_elem.h
> +++ b/lib/librte_eal/common/include/rte_tailq_elem.h
> @@ -84,6 +84,8 @@ rte_tailq_elem(RTE_TAILQ_ACL, "RTE_ACL")
>
>  rte_tailq_elem(RTE_TAILQ_DISTRIBUTOR, "RTE_DISTRIBUTOR")
>
> +rte_tailq_elem(RTE_TAILQ_REORDER, "RTE_REORDER")
> +
>  rte_tailq_end(RTE_TAILQ_NUM)
>
>  #undef rte_tailq_elem
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 16059c6..ed27eb8 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -262,6 +262,9 @@ struct rte_mbuf {
> uint32_t usr; /**< User defined tags. See 
> @rte_distributor_process */
> } hash;   /**< hash information */
>
> +   /* sequence number - field used in distributor and reorder library */
> +   uint32_t seqn;
> +
> /* second cache line - fields only used in slow path or on TX */
> MARKER cacheline1 __rte_cache_aligned;
>
> diff --git a/lib/librte_reorder/Makefile b/lib/librte_reorder/Makefile
> new file mode 100644
> index 000..12b916f
> --- /dev/null
> +++ b/lib/librte_reorder/Makefile
> @@ -0,0 +1,50 @@
> +#   BSD LICENSE
> +#
> +#   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> +#   All rights reserved.
> +#
> +#   Redistribution and use in source and binary forms, with or without
> +#   modification, are permitted provided that the following conditions
> +#   are met:
> +#
> +# * Redistributions of source code must retain the above copyright
> +#   notice, this list of conditions and the following disclaimer.
> +# * Redistributions in binary form must reproduce the above copyright
> +#   notice, this list of conditions and the following disclaimer in
> +#   the documentation and/or other materials provided with the
> +#   distribution.
> +# * 

[dpdk-dev] [PATCH] ixgbe/igb: integrate syn filter to new API

2015-01-08 Thread zhida zang
This patch integrates syn filter to new API in ixgbe/igb driver.

changes:
ixgbe: remove old functions that deal with syn filter
ixgbe: add new functions that deal with syn filter (fit for filter_ctrl API)
e1000: remove old functions that deal with syn filter
e1000: add new functions that deal with syn filter (fit for filter_ctrl API)
testpmd: change the entry for syn filter in cmdline
testpmd: change function call to get syn filter in config

Signed-off-by: Zhida Zang 
---
 app/test-pmd/cmdline.c  | 179 ---
 app/test-pmd/config.c   |  10 +-
 lib/librte_ether/rte_eth_ctrl.h |  12 +++
 lib/librte_pmd_e1000/igb_ethdev.c   | 176 --
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c | 182 
 5 files changed, 331 insertions(+), 228 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 4c3fc76..820b3a6 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -665,13 +665,13 @@ static void cmd_help_long_parsed(void *parsed_result,
"get_5tuple_filter (port_id) index (idx)\n"
"get info of a 5tuple filter.\n\n"

-   "add_syn_filter (port_id) priority (high|low) queue 
(queue_id)"
+   "syn_filter add (port_id) priority (high|low) queue 
(queue_id)"
"add syn filter.\n\n"

-   "remove_syn_filter (port_id)"
+   "syn_filter del (port_id)"
"remove syn filter.\n\n"

-   "get_syn_filter (port_id) "
+   "syn_filter get (port_id) "
"get syn filter info.\n\n"

"add_flex_filter (port_id) len (len_value) bytes 
(bytes_string) mask (mask_value)"
@@ -7044,101 +7044,130 @@ cmdline_parse_inst_t cmd_get_ethertype_filter = {
},
 };

-/* *** set SYN filter *** */
-struct cmd_set_syn_filter_result {
+/* *** Add/Del/Get syn filter *** */
+struct cmd_syn_filter_result {
cmdline_fixed_string_t filter;
+   cmdline_fixed_string_t  ops;
uint8_t port_id;
cmdline_fixed_string_t priority;
cmdline_fixed_string_t high;
cmdline_fixed_string_t queue;
-   uint16_t  queue_id;
+   uint16_t queue_id;
 };

-static void
-cmd_set_syn_filter_parsed(void *parsed_result,
-   __attribute__((unused)) struct cmdline *cl,
-   __attribute__((unused)) void *data)
-{
-   int ret = 0;
-   struct cmd_set_syn_filter_result *res = parsed_result;
-   struct rte_syn_filter filter;
-
-   if (!strcmp(res->filter, "add_syn_filter")) {
-   if (!strcmp(res->high, "high"))
-   filter.hig_pri = 1;
-   else
-   filter.hig_pri = 0;
-   ret = rte_eth_dev_add_syn_filter(res->port_id,
-   , res->queue_id);
-   } else if (!strcmp(res->filter, "remove_syn_filter"))
-   ret = rte_eth_dev_remove_syn_filter(res->port_id);
-   else if (!strcmp(res->filter, "get_syn_filter"))
-   get_syn_filter(res->port_id);
-   if (ret < 0)
-   printf("syn filter setting error: (%s)\n", strerror(-ret));
-
-}
-cmdline_parse_token_num_t cmd_syn_filter_portid =
-   TOKEN_NUM_INITIALIZER(struct cmd_set_syn_filter_result,
-   port_id, UINT8);
+cmdline_parse_token_string_t cmd_syn_filter_filter =
+   TOKEN_STRING_INITIALIZER(struct cmd_syn_filter_result,
+   filter, "syn_filter");
+cmdline_parse_token_string_t cmd_syn_filter_add =
+   TOKEN_STRING_INITIALIZER(struct cmd_syn_filter_result,
+   ops, "add");
+cmdline_parse_token_string_t cmd_syn_filter_del =
+   TOKEN_STRING_INITIALIZER(struct cmd_syn_filter_result,
+   ops, "del");
+cmdline_parse_token_string_t cmd_syn_filter_get =
+   TOKEN_STRING_INITIALIZER(struct cmd_syn_filter_result,
+   ops, "get");
+cmdline_parse_token_num_t cmd_syn_filter_port_id =
+   TOKEN_NUM_INITIALIZER(struct cmd_syn_filter_result,
+   port_id, UINT8);
 cmdline_parse_token_string_t cmd_syn_filter_priority =
-   TOKEN_STRING_INITIALIZER(struct cmd_set_syn_filter_result,
+   TOKEN_STRING_INITIALIZER(struct cmd_syn_filter_result,
priority, "priority");
 cmdline_parse_token_string_t cmd_syn_filter_high =
-   TOKEN_STRING_INITIALIZER(struct cmd_set_syn_filter_result,
+   TOKEN_STRING_INITIALIZER(struct cmd_syn_filter_result,
high, "high#low");
 cmdline_parse_token_string_t cmd_syn_filter_queue =
-   TOKEN_STRING_INITIALIZER(struct cmd_set_syn_filter_result,
+   TOKEN_STRING_INITIALIZER(struct cmd_syn_filter_result,
queue, "queue");
 cmdline_parse_token_num_t cmd_syn_filter_queue_id =
-   

[dpdk-dev] [PATCH v3] i40e: workaround for X710 performance issues

2015-01-08 Thread Wu, Jingjing


> -Original Message-
> From: Zhang, Helin
> Sent: Tuesday, December 16, 2014 4:23 PM
> To: dev at dpdk.org
> Cc: Chen, Jing D; Wu, Jingjing; Liu, Jijiang; Cao, Waterman; Lu, Patrick;
> Rowden, Aaron F; Zhang, Helin
> Subject: [PATCH v3] i40e: workaround for X710 performance issues
> 
> On X710, performance number is far from the expectation on recent
> firmware versions. The fix for this issue may not be integrated in the
> following firmware version. So the workaround in software driver is needed.
> It needs to modify the initial values of 3 internal only registers. Note that 
> the
> workaround can be removed when it is fixed in firmware in the future.
> 
> Signed-off-by: Helin Zhang 
> ---
>  lib/librte_pmd_i40e/i40e_ethdev.c | 89
> +++
>  1 file changed, 89 insertions(+)
> 
> v2 changes:
> * Added a compile error fix.
> 
> v3 changes:
> * Used PRIx32 and PRIx64 instead for printing uint32_t and uint64_t
>   variables.
> * Re-worded annotations, and commit logs.
> 
> diff --git a/lib/librte_pmd_i40e/i40e_ethdev.c
> b/lib/librte_pmd_i40e/i40e_ethdev.c
> index 008d62c..624f0ce 100644
> --- a/lib/librte_pmd_i40e/i40e_ethdev.c
> +++ b/lib/librte_pmd_i40e/i40e_ethdev.c
> @@ -198,6 +198,7 @@ static int i40e_dev_filter_ctrl(struct rte_eth_dev *dev,
>   enum rte_filter_type filter_type,
>   enum rte_filter_op filter_op,
>   void *arg);
> +static void i40e_configure_registers(struct i40e_hw *hw);
> 
>  /* Default hash key buffer for RSS */
>  static uint32_t rss_key_default[I40E_PFQF_HKEY_MAX_INDEX + 1]; @@ -
> 443,6 +444,16 @@ eth_i40e_dev_init(__rte_unused struct eth_driver
> *eth_drv,
>   /* Clear PXE mode */
>   i40e_clear_pxe_mode(hw);
> 
> + /*
> +  * On X710, performance number is far from the expectation on
> recent
> +  * firmware versions. The fix for this issue may not be integrated in
> +  * the following firmware version. So the workaround in software
> driver
> +  * is needed. It needs to modify the initial values of 3 internal only
> +  * registers. Note that the workaround can be removed when it is
> fixed
> +  * in firmware in the future.
> +  */
> + i40e_configure_registers(hw);
> +
>   /* Get hw capabilities */
>   ret = i40e_get_cap(hw);
>   if (ret != I40E_SUCCESS) {
> @@ -5294,3 +5305,81 @@ i40e_pctype_to_flowtype(enum
> i40e_filter_pctype pctype)
> 
>   return flowtype_table[pctype];
>  }
> +
> +static int
> +i40e_debug_read_register(struct i40e_hw *hw, uint32_t addr, uint64_t
> +*val) {
> + struct i40e_aq_desc desc;
> + struct i40e_aqc_debug_reg_read_write *cmd =
> + (struct i40e_aqc_debug_reg_read_write
> *)
> + enum i40e_status_code status;
> +
> + i40e_fill_default_direct_cmd_desc(,
> i40e_aqc_opc_debug_read_reg);
> + cmd->address = rte_cpu_to_le_32(addr);
> + status = i40e_asq_send_command(hw, , NULL, 0, NULL);
> + if (status < 0)
> + return status;
> +
> + *val = ((uint64_t)(rte_le_to_cpu_32(cmd->value_high)) <<
> (CHAR_BIT *
> + sizeof(uint32_t))) + rte_le_to_cpu_32(cmd-
> >value_low);
> +

Do we need to add rte_le_to_cpu_64 here?

> + return status;
> +}
> +
> +/*
> + * On X710, performance number is far from the expectation on recent
> +firmware
> + * versions. The fix for this issue may not be integrated in the
> +following
> + * firmware version. So the workaround in software driver is needed. It
> +needs
> + * to modify the initial values of 3 internal only registers. Note that
> +the
> + * workaround can be removed when it is fixed in firmware in the future.
> + */
> +static void
> +i40e_configure_registers(struct i40e_hw *hw) {
> +#define I40E_GL_SWR_PRI_JOIN_MAP_0   0x26CE00
> +#define I40E_GL_SWR_PRI_JOIN_MAP_2   0x26CE08
> +#define I40E_GL_SWR_PM_UP_THR0x269FBC
> +#define I40E_GL_SWR_PRI_JOIN_MAP_0_VALUE 0x1200 #define
> +I40E_GL_SWR_PRI_JOIN_MAP_2_VALUE 0x011f0200
> +#define I40E_GL_SWR_PM_UP_THR_VALUE  0x03030303
> +
> + static const struct {
> + uint32_t addr;
> + uint64_t val;
> + } reg_table[] = {
> + {I40E_GL_SWR_PRI_JOIN_MAP_0,
> I40E_GL_SWR_PRI_JOIN_MAP_0_VALUE},
> + {I40E_GL_SWR_PRI_JOIN_MAP_2,
> I40E_GL_SWR_PRI_JOIN_MAP_2_VALUE},
> + {I40E_GL_SWR_PM_UP_THR,
> I40E_GL_SWR_PM_UP_THR_VALUE},
> + };
> + uint64_t reg;
> + uint32_t i;
> + int ret;
> +
> + /* Below fix is for X710 only */
> + if (i40e_is_40G_device(hw->device_id))
> + return;
> +
> + for (i = 0; i < RTE_DIM(reg_table); i++) {
> + ret = i40e_debug_read_register(hw, reg_table[i].addr, );
> + if (ret < 0) {
> + PMD_DRV_LOG(ERR, "Failed to read from
> 0x%"PRIx32,
> + reg_table[i].addr);
> + 

[dpdk-dev] Packet Rx issue with DPDK1.8

2015-01-08 Thread Prashant Upadhyaya
Hi,

I am migrating from DPDK1.7 to DPDK1.8.
My application works fine with DPDK1.7.
I am using 10 Gb Intel 82599 NIC.
I have jumbo frames enabled, with max_rx_pkt_len = 10232
My mbuf dataroom size is 2048+headroom
So naturally the ixgbe_recv_scattered_pkts driver function is triggered for
receiving.
This works with DPDK1.7 and my app receives packets.
However, it does not work with DPDK1.8 somehow. I don't receive any packets.

So, I increased the mbuf data room size in my application to a higher value
so that the function ixgbe_recv_scattered_pkts is not enabled (I believe
ixgbe_recv_pkts will be used in this case), and now my application starts
getting packets with DPDK1.8 and the entire application usecase works fine
(ofcourse my application had to adapt to the mbuf structure changes which I
have done)

I am kind of coming to the conclusion that ixgbe_recv_scattered_pkts has
something broken in DPDK1.8 as compared to the earlier versions by the
above empirical evidence.

Has anybody else faced a similar issue ?

Regards
-Prashant


[dpdk-dev] rte_mempool_create fails with ENOMEM

2015-01-08 Thread Newman Poborsky
I finally found the time to try this and I noticed that on a server
with 1 NUMA node, this works, but if  server has 2 NUMA nodes than by
default memory policy, reserved hugepages are divided on each node and
again DPDK test app fails for the reason already mentioned. I found
out that 'solution' for this is to deallocate hugepages on node1
(after boot) and leave them only on node0:
echo 0 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages

Could someone please explain what changes when there are hugepages on
both nodes? Does this cause some memory fragmentation so that there
aren't enough contiguous segments? If so, how?

Thanks!

Newman

On Mon, Dec 22, 2014 at 11:48 AM, Newman Poborsky  
wrote:
> On Sat, Dec 20, 2014 at 2:34 AM, Stephen Hemminger
>  wrote:
>> You can reserve hugepages on the kernel cmdline (GRUB).
>
> Great, thanks, I'll try that!
>
> Newman
>
>>
>> On Fri, Dec 19, 2014 at 12:13 PM, Newman Poborsky 
>> wrote:
>>>
>>> On Thu, Dec 18, 2014 at 9:03 PM, Ananyev, Konstantin <
>>> konstantin.ananyev at intel.com> wrote:
>>>
>>> >
>>> >
>>> > > -Original Message-
>>> > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Ananyev,
>>> > > Konstantin
>>> > > Sent: Thursday, December 18, 2014 5:43 PM
>>> > > To: Newman Poborsky; dev at dpdk.org
>>> > > Subject: Re: [dpdk-dev] rte_mempool_create fails with ENOMEM
>>> > >
>>> > > Hi
>>> > >
>>> > > > -Original Message-
>>> > > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Newman 
>>> > > > Poborsky
>>> > > > Sent: Thursday, December 18, 2014 1:26 PM
>>> > > > To: dev at dpdk.org
>>> > > > Subject: [dpdk-dev] rte_mempool_create fails with ENOMEM
>>> > > >
>>> > > > Hi,
>>> > > >
>>> > > > could someone please provide any explanation why sometimes mempool
>>> > creation
>>> > > > fails with ENOMEM?
>>> > > >
>>> > > > I run my test app several times without any problems and then I
>>> > > > start
>>> > > > getting ENOMEM error when creating mempool that are used for
>>> > > > packets.
>>> > I try
>>> > > > to delete everything from /mnt/huge, I increase the number of huge
>>> > pages,
>>> > > > remount /mnt/huge but nothing helps.
>>> > > >
>>> > > > There is more than enough memory on server. I tried to debug
>>> > > > rte_mempool_create() call and it seems that after server is
>>> > > > restarted
>>> > free
>>> > > > mem segments are bigger than 2MB, but after running test app for
>>> > several
>>> > > > times, it seems that all free mem segments have a size of 2MB, and
>>> > since I
>>> > > > am requesting 8MB for my packet mempool, this fails.  I'm not really
>>> > sure
>>> > > > that this conclusion is correct.
>>> > >
>>> > > Yes,rte_mempool_create uses  rte_memzone_reserve() to allocate
>>> > > single physically continuous chunk of memory.
>>> > > If no such chunk exist, then it would fail.
>>> > > Why physically continuous?
>>> > > Main reason - to make things easier for us, as in that case we don't
>>> > have to worry
>>> > > about situation when mbuf crosses page boundary.
>>> > > So you can overcome that problem like that:
>>> > > Allocate max amount of memory you would need to hold all mbufs in
>>> > > worst
>>> > case (all pages physically disjoint)
>>> > > using rte_malloc().
>>> >
>>> > Actually my wrong: rte_malloc()s wouldn't help you here.
>>> > You probably need to allocate some external (not managed by EAL) memory
>>> > in
>>> > that case,
>>> > may be mmap() with MAP_HUGETLB, or something similar.
>>> >
>>> > > Figure out it's physical mappings.
>>> > > Call  rte_mempool_xmem_create().
>>> > > You can look at: app/test-pmd/mempool_anon.c as a reference.
>>> > > It uses same approach to create mempool over 4K pages.
>>> > >
>>> > > We probably add similar function into mempool API
>>> > (create_scatter_mempool or something)
>>> > > or just add a new flag (USE_SCATTER_MEM) into rte_mempool_create().
>>> > > Though right now it is not there.
>>> > >
>>> > > Another quick alternative - use 1G pages.
>>> > >
>>> > > Konstantin
>>> >
>>>
>>>
>>> Ok, thanks for the explanation. I understand that this is probably an OS
>>> question more than DPDK, but is there a way to again allocate a contiguous
>>> memory for n-th run of my test app?  It seems that hugepages get
>>> divided/separated into individual 2MB hugepages. Shouldn't the OS's memory
>>> management system try to group those hugepages back into one contiguous chunk
>>> once my app/process is done?   Again, I know very little about Linux
>>> memory
>>> management and hugepages, so forgive me if this is a stupid question.
>>> Is rebooting the OS the only way to deal with this problem?  Or should I
>>> just try to use 1GB hugepages?
>>>
>>> p.s. Konstantin, sorry for the double reply, I accidentally forgot to
>>> include dev list in my first reply  :)
>>>
>>> Newman
>>>
>>> >
>>> > > >
>>> > > > Does anybody have any idea what to check and how running my test app
>>> > > > several times affects hugepages?
>>> > > >
>>> > > > For me, this 

[dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum forwarding engine

2015-01-08 Thread Liu, Jijiang
Hi,

> -Original Message-
> From: Ananyev, Konstantin
> Sent: Wednesday, January 7, 2015 8:07 PM
> To: Liu, Jijiang; 'Olivier MATZ'
> Cc: dev at dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and
> csum forwarding engine
> 
> 
> 
> > -Original Message-
> > From: Liu, Jijiang
> > Sent: Wednesday, January 07, 2015 11:39 AM
> > To: Ananyev, Konstantin; 'Olivier MATZ'
> > Cc: dev at dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and
> > csum forwarding engine
> >
> > Hi Konstantin,
> >
> > > -Original Message-
> > > From: Ananyev, Konstantin
> > > Sent: Wednesday, January 7, 2015 5:59 PM
> > > To: Liu, Jijiang; 'Olivier MATZ'
> > > Cc: dev at dpdk.org
> > > Subject: RE: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command
> > > and csum forwarding engine
> > >
> > > Hi Frank,
> > >
> > > > -Original Message-
> > > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Liu, Jijiang
> > > > Sent: Wednesday, January 07, 2015 2:04 AM
> > > > To: 'Olivier MATZ'
> > > > Cc: dev at dpdk.org
> > > > Subject: Re: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command
> > > > and csum forwarding engine
> > > >
> > > > Hi Olivier,
> > > >
> > > > > -Original Message-
> > > > > From: Olivier MATZ [mailto:olivier.matz at 6wind.com]
> > > > > Sent: Saturday, December 13, 2014 12:33 AM
> > > > > To: Liu, Jijiang
> > > > > Cc: dev at dpdk.org
> > > > > Subject: Re: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum
> > > > > command and csum forwarding engine
> > > > >
> > > > > Hello,
> > > > >
> > > > > On 12/12/2014 04:48 AM, Liu, Jijiang wrote:
> > > > > > The 'hw/sw' option  is used to set/clear the flag of enabling
> > > > > > TX tunneling packet
> > > > > checksum hardware offload in testpmd application.
> > > > >
> > > > > This is not clear at all.
> > > > > In your command, there is (hw|sw|none).
> > > > > Are you talking about inner or outer?
> > > > > Is this command useful for any kind of packet?
> > > > > How does it combine with "tx_checksum set outer-ip (hw|sw)"?
> > > > >
> > > >
> > > > I rethink these TX checksum commands in this patch set and agree
> > > > with you that we should make some changes for having clear meaning for
> them.
> > > >
> > > > There are  3 commands in patch set as follows, 1. tx_checksum set
> > > > tunnel (hw|sw|none) (port-id)
> > > >
> > > > Now I also think the command 1 may confuse user, they probably
> > > > don't understand  why we need 'hw' or 'sw' option and when  to use
> > > > the two option, so I will replace the command with 'tx_checksum
> > > > set hw-tunnel-mode
> > > (on|off) (port-id)' command.
> > >
> > > I am a bit confused here, could you explain what would be a
> > > behaviour for 'on' and 'off'?
> > > Konstantin
> >
> > I have explained the behaviour for 'on' and'off' below,
> >
> > The command 'tx_checksum set hw-tunnel-mode (on|off)  (port-id)' is
> > used to set/clear  TESTPMD_TX_OFFLOAD_TUNNEL_CKSUM flag.
> >
> > Actually, the PKT_TX_UDP_TUNNEL_PKT offload flag will be set if the
> > testpmd flag is set, which means to tell HW treat  that transmit
> > packet as a tunneling packet to do checksum offload  When 'on' is set, 
> > which is
> able to meet Method B.1 and method C.
> >
> > When 'off' is set, the TESTPMD_TX_OFFLOAD_TUNNEL_CKSUM is not needed
> > to set, so the PKT_TX_UDP_TUNNEL_PKT offload flag is not needed to
> > set,  then HW treat  that transmit packet as a non-tunneling packet. It is 
> > able to
> meet Method B.2.
> >
> > Is the explanation not clear?
> 
> Ok, and how I can set method A (testpmd treat all packets as non-tunnelling 
> and
> never look beyond outer L4 header) then?
> Konstantin


> > > > As to case A, I think it is not mandatory to cover it in csum fwd
> > > > engine for tunneling packet.

If you think the case A is essential, and it must be covered in  csum fwd, then 
we can add a command:

tx_checksum set sw-tunnel-mode (on|off)  (port-id)

If the 'off' is set, the csum fwd engine doesn't check whether the packet is a
tunneling packet, and treats all packets as non-tunneling, never looking beyond
the outer L4 header.

If the 'on' is set, the csum fwd engine will check whether the packet is a
tunneling packet.

And we are able to test all of cases in  
http://dpdk.org/ml/archives/dev/2014-December/009213.html

Test case A:

tx_checksum set sw-tunnel-mode off
tx_checksum set hw-tunnel-mode off
tx_checksum set  ip   hw

test case B.1:

tx_checksum set sw-tunnel-mode on
tx_checksum set hw-tunnel-mode on
tx_checksum set  ip   hw
tx_checksum set  tcp   hw

test case B.2:

tx_checksum set sw-tunnel-mode on
tx_checksum set hw-tunnel-mode off
tx_checksum set  ip   hw

test case C:

tx_checksum set sw-tunnel-mode on
tx_checksum set hw-tunnel-mode on
tx_checksum set  outer-ip   hw
tx_checksum set  ip   hw
tx_checksum set  tcp   hw


 In addition, the reason of discarding ' tx_checksum set  tunnel (hw|sw|none) 
(port-id)' command is that  user probably 

[dpdk-dev] [PATCH v5 3/6] ixgbe: Get VF queue number

2015-01-08 Thread Vlad Zolotarov

On 01/07/15 08:32, Ouyang Changchun wrote:
> Get the available Rx and Tx queue number when receiving IXGBE_VF_GET_QUEUES 
> message from VF.
>
> Signed-off-by: Changchun Ouyang 

Reviewed-by: Vlad Zolotarov 

>
> changes in v5
>- Add some 'FIX ME' comments for IXGBE_VF_TRANS_VLAN.
>
> ---
>   lib/librte_pmd_ixgbe/ixgbe_pf.c | 40 
> +++-
>   1 file changed, 39 insertions(+), 1 deletion(-)
>
> diff --git a/lib/librte_pmd_ixgbe/ixgbe_pf.c b/lib/librte_pmd_ixgbe/ixgbe_pf.c
> index 495aff5..dbda9b5 100644
> --- a/lib/librte_pmd_ixgbe/ixgbe_pf.c
> +++ b/lib/librte_pmd_ixgbe/ixgbe_pf.c
> @@ -53,6 +53,8 @@
>   #include "ixgbe_ethdev.h"
>   
>   #define IXGBE_MAX_VFTA (128)
> +#define IXGBE_VF_MSG_SIZE_DEFAULT 1
> +#define IXGBE_VF_GET_QUEUE_MSG_SIZE 5
>   
>   static inline uint16_t
>   dev_num_vf(struct rte_eth_dev *eth_dev)
> @@ -491,9 +493,41 @@ ixgbe_negotiate_vf_api(struct rte_eth_dev *dev, uint32_t 
> vf, uint32_t *msgbuf)
>   }
>   
>   static int
> +ixgbe_get_vf_queues(struct rte_eth_dev *dev, uint32_t vf, uint32_t *msgbuf)
> +{
> + struct ixgbe_vf_info *vfinfo =
> + *IXGBE_DEV_PRIVATE_TO_P_VFDATA(dev->data->dev_private);
> + uint32_t default_q = vf * RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool;
> +
> + /* Verify if the PF supports the mbox APIs version or not */
> + switch (vfinfo[vf].api_version) {
> + case ixgbe_mbox_api_20:
> + case ixgbe_mbox_api_11:
> + break;
> + default:
> + return -1;
> + }
> +
> + /* Notify VF of Rx and Tx queue number */
> + msgbuf[IXGBE_VF_RX_QUEUES] = RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool;
> + msgbuf[IXGBE_VF_TX_QUEUES] = RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool;
> +
> + /* Notify VF of default queue */
> + msgbuf[IXGBE_VF_DEF_QUEUE] = default_q;
> +
> + /*
> +  * FIX ME if it needs fill msgbuf[IXGBE_VF_TRANS_VLAN]
> +  * for VLAN strip or VMDQ_DCB or VMDQ_DCB_RSS
> +  */
> +
> + return 0;
> +}
> +
> +static int
>   ixgbe_rcv_msg_from_vf(struct rte_eth_dev *dev, uint16_t vf)
>   {
>   uint16_t mbx_size = IXGBE_VFMAILBOX_SIZE;
> + uint16_t msg_size = IXGBE_VF_MSG_SIZE_DEFAULT;
>   uint32_t msgbuf[IXGBE_VFMAILBOX_SIZE];
>   int32_t retval;
>   struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> @@ -537,6 +571,10 @@ ixgbe_rcv_msg_from_vf(struct rte_eth_dev *dev, uint16_t 
> vf)
>   case IXGBE_VF_API_NEGOTIATE:
>   retval = ixgbe_negotiate_vf_api(dev, vf, msgbuf);
>   break;
> + case IXGBE_VF_GET_QUEUES:
> + retval = ixgbe_get_vf_queues(dev, vf, msgbuf);
> + msg_size = IXGBE_VF_GET_QUEUE_MSG_SIZE;
> + break;
>   default:
>   PMD_DRV_LOG(DEBUG, "Unhandled Msg %8.8x", (unsigned)msgbuf[0]);
>   retval = IXGBE_ERR_MBX;
> @@ -551,7 +589,7 @@ ixgbe_rcv_msg_from_vf(struct rte_eth_dev *dev, uint16_t 
> vf)
>   
>   msgbuf[0] |= IXGBE_VT_MSGTYPE_CTS;
>   
> - ixgbe_write_mbx(hw, msgbuf, 1, vf);
> + ixgbe_write_mbx(hw, msgbuf, msg_size, vf);
>   
>   return retval;
>   }



[dpdk-dev] [PATCH v5 5/6] ixgbe: Config VF RSS

2015-01-08 Thread Vlad Zolotarov

On 01/07/15 08:32, Ouyang Changchun wrote:
> It needs config RSS and IXGBE_MRQC and IXGBE_VFPSRTYPE to enable VF RSS.
>
> The psrtype will determine how many queues the received packets will 
> distribute to,
> and the value of psrtype should depends on both facet: max VF rxq number which
> has been negotiated with PF, and the number of rxq specified in config on 
> guest.
>
> Signed-off-by: Changchun Ouyang 
>
> Changes in v4:
>   - the number of rxq from config should be power of 2 and should not bigger 
> than
>  max VF rxq number(negotiated between guest and host).
>
> ---
>   lib/librte_pmd_ixgbe/ixgbe_pf.c   |  15 ++
>   lib/librte_pmd_ixgbe/ixgbe_rxtx.c | 103 
> +-
>   2 files changed, 106 insertions(+), 12 deletions(-)
>
> diff --git a/lib/librte_pmd_ixgbe/ixgbe_pf.c b/lib/librte_pmd_ixgbe/ixgbe_pf.c
> index dbda9b5..93f6e43 100644
> --- a/lib/librte_pmd_ixgbe/ixgbe_pf.c
> +++ b/lib/librte_pmd_ixgbe/ixgbe_pf.c
> @@ -187,6 +187,21 @@ int ixgbe_pf_host_configure(struct rte_eth_dev *eth_dev)
>   IXGBE_WRITE_REG(hw, IXGBE_MPSAR_LO(hw->mac.num_rar_entries), 0);
>   IXGBE_WRITE_REG(hw, IXGBE_MPSAR_HI(hw->mac.num_rar_entries), 0);
>   
> + /*
> +  * VF RSS can support at most 4 queues for each VF, even if
> +  * 8 queues are available for each VF, it need refine to 4
> +  * queues here due to this limitation, otherwise no queue
> +  * will receive any packet even RSS is enabled.
> +  */
> + if (eth_dev->data->dev_conf.rxmode.mq_mode == ETH_MQ_RX_VMDQ_RSS) {
> + if (RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool == 8) {
> + RTE_ETH_DEV_SRIOV(eth_dev).active = ETH_32_POOLS;
> + RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool = 4;
> + RTE_ETH_DEV_SRIOV(eth_dev).def_pool_q_idx =
> + dev_num_vf(eth_dev) * 4;
> + }
> + }
> +
>   /* set VMDq map to default PF pool */
>   hw->mac.ops.set_vmdq(hw, 0, RTE_ETH_DEV_SRIOV(eth_dev).def_vmdq_idx);
>   
> diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c 
> b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> index f69abda..e83a9ab 100644
> --- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> +++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> @@ -3327,6 +3327,68 @@ ixgbe_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
>   }
>   
>   static int
> +ixgbe_config_vf_rss(struct rte_eth_dev *dev)
> +{
> + struct ixgbe_hw *hw;
> + uint32_t mrqc;
> +
> + ixgbe_rss_configure(dev);
> +
> + hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +
> + /* MRQC: enable VF RSS */
> + mrqc = IXGBE_READ_REG(hw, IXGBE_MRQC);
> + mrqc &= ~IXGBE_MRQC_MRQE_MASK;
> + switch (RTE_ETH_DEV_SRIOV(dev).active) {
> + case ETH_64_POOLS:
> + mrqc |= IXGBE_MRQC_VMDQRSS64EN;

> + break;
> +
> + case ETH_32_POOLS:
> + case ETH_16_POOLS:

Isn't the ETH_16_POOLS mode invalid for VF RSS? That's what both the spec 
states and what you handle in this patch in ixgbe_pf_host_configure(). 
IMHO it would be better to treat this mode value as an error here, since 
getting it here indicates a SW bug.

> + mrqc |= IXGBE_MRQC_VMDQRSS32EN;
> + break;
> +
> + default:
> + PMD_INIT_LOG(ERR, "Invalid pool number in IOV mode");
> + return -EINVAL;
> + }
> +
> + IXGBE_WRITE_REG(hw, IXGBE_MRQC, mrqc);
> +
> + return 0;
> +}
> +
> +static int
> +ixgbe_config_vf_default(struct rte_eth_dev *dev)
> +{
> + struct ixgbe_hw *hw =
> + IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +
> + switch (RTE_ETH_DEV_SRIOV(dev).active) {
> + case ETH_64_POOLS:
> + IXGBE_WRITE_REG(hw, IXGBE_MRQC,
> + IXGBE_MRQC_VMDQEN);
> + break;
> +
> + case ETH_32_POOLS:
> + IXGBE_WRITE_REG(hw, IXGBE_MRQC,
> + IXGBE_MRQC_VMDQRT4TCEN);
> + break;
> +
> + case ETH_16_POOLS:
> + IXGBE_WRITE_REG(hw, IXGBE_MRQC,
> + IXGBE_MRQC_VMDQRT8TCEN);
> + break;
> + default:
> + PMD_INIT_LOG(ERR,
> + "invalid pool number in IOV mode");
> + break;
> + }
> + return 0;
> +}
> +
> +static int
>   ixgbe_dev_mq_rx_configure(struct rte_eth_dev *dev)
>   {
>   struct ixgbe_hw *hw =
> @@ -3358,24 +3420,25 @@ ixgbe_dev_mq_rx_configure(struct rte_eth_dev *dev)
>   default: ixgbe_rss_disable(dev);
>   }
>   } else {
> - switch (RTE_ETH_DEV_SRIOV(dev).active) {
>   /*
>* SRIOV active scheme
> -  * FIXME if support DCB/RSS together with VMDq & SRIOV
> +  * Support RSS together with VMDq & SRIOV
>*/
> - case ETH_64_POOLS:
> - IXGBE_WRITE_REG(hw, IXGBE_MRQC, IXGBE_MRQC_VMDQEN);
> - break;
> -
> - case 

[dpdk-dev] [PATCH v5 6/6] testpmd: Set Rx VMDq RSS mode

2015-01-08 Thread Vlad Zolotarov

On 01/07/15 08:32, Ouyang Changchun wrote:
> Set VMDq RSS mode if it has VF(VF number is more than 1) and has RSS 
> information.
>
> Signed-off-by: Changchun Ouyang 

Reviewed-by: Vlad Zolotarov 

Some nitpicking below... ;)

>
> changes in v5
>- Assign txmode.mq_mode with ETH_MQ_TX_NONE explicitly;
>- Remove one line wrong comment.
>
> ---
>   app/test-pmd/testpmd.c | 15 ++-
>   1 file changed, 14 insertions(+), 1 deletion(-)
>
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
> index 8c69756..64fd4ee 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -1700,7 +1700,6 @@ init_port_config(void)
>   port->dev_conf.rx_adv_conf.rss_conf.rss_hf = 0;
>   }
>   
> - /* In SR-IOV mode, RSS mode is not available */
>   if (port->dcb_flag == 0 && port->dev_info.max_vfs == 0) {
>   if( port->dev_conf.rx_adv_conf.rss_conf.rss_hf != 0)
>   port->dev_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
> @@ -1708,6 +1707,20 @@ init_port_config(void)
>   port->dev_conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
>   }
>   
> + if (port->dev_info.max_vfs != 0) {
> + if (port->dev_conf.rx_adv_conf.rss_conf.rss_hf != 0) {
> + port->dev_conf.rxmode.mq_mode =
> + ETH_MQ_RX_VMDQ_RSS;
> + port->dev_conf.txmode.mq_mode =
> + ETH_MQ_TX_NONE;
> + } else {
> + port->dev_conf.rxmode.mq_mode =
> + ETH_MQ_RX_NONE;
> + port->dev_conf.txmode.mq_mode =
> + ETH_MQ_TX_NONE;

It seems that txmode.mq_mode assignment may be taken out of the 
"if-else" statement here... ;)

> + }
> + }
> +
>   port->rx_conf.rx_thresh = rx_thresh;
>   port->rx_conf.rx_free_thresh = rx_free_thresh;
>   port->rx_conf.rx_drop_en = rx_drop_en;



[dpdk-dev] [PATCH v5 0/6] Enable VF RSS for Niantic

2015-01-08 Thread Vlad Zolotarov

On 01/07/15 08:32, Ouyang Changchun wrote:
> This patch enables VF RSS for Niantic, which allows each VF to have at most 4 
> queues.
> The actual queue number per VF depends on the total number of pools, which is
> determined by the max number of VFs at the PF initialization stage and the 
> number of
> queues specified in config:
> 1) If the max number of VF is in the range from 1 to 32, and the number of 
> rxq is 4
> ('--rxq 4' in testpmd), then there is totally 32 pools(ETH_32_POOLS), and 
> each VF
> have 4 queues;
>   
> 2)If the max number of VF is in the range from 33 to 64, and the number of 
> rxq is 2
> ('--rxq 2' in testpmd), then there is totally 64 pools(ETH_64_POOLS), and 
> each VF
> have 2 queues;
>   
> On host, to enable VF RSS functionality, rx mq mode should be set as 
> ETH_MQ_RX_VMDQ_RSS
> or ETH_MQ_RX_RSS mode, and SRIOV mode should be activated(max_vfs >= 1).
> It also needs config VF RSS information like hash function, RSS key, RSS key 
> length.
>   
> The limitation for Niantic VF RSS is:
> the hash and key are shared among PF and all VF, the RETA table with 128 
> entries are
> also shared among PF and all VF. So it could not to provide a method to query 
> the hash
> and reta content per VF on guest, while, if possible, please query them on 
> host(PF) for
> the shared RETA information.

I've acked PATCH1 and PATCH2 already before and since there are no 
changes in them, pls.,  consider them ACKed... ;)

>
> changes in v5:
>- Fix minor issue and some comments;
>
> changes in v4:
>- Extract a function to remove embeded switch-case statement;
>- Check whether RX queue number is a valid one, otherwise return error;
>- Update the description a bit;
>   
> changes in v3:
>- More cleanup;
>   
> changes in v2:
>- Update the description;
>- Use receiving queue number('--rxq ') specified in config to 
> determine the
>  number of pool and the number of queue per VF;
>   
> changes in v1:
>- Config VF RSS;
>
> Changchun Ouyang (6):
>ixgbe: Code cleanup
>ixgbe: Negotiate VF API version
>ixgbe: Get VF queue number
>ether: Check VMDq RSS mode
>ixgbe: Config VF RSS
>testpmd: Set Rx VMDq RSS mode
>
>   app/test-pmd/testpmd.c  |  15 +++-
>   lib/librte_ether/rte_ethdev.c   |  50 +++--
>   lib/librte_pmd_ixgbe/ixgbe_ethdev.h |   1 +
>   lib/librte_pmd_ixgbe/ixgbe_pf.c |  80 -
>   lib/librte_pmd_ixgbe/ixgbe_rxtx.c   | 138 
> 
>   5 files changed, 248 insertions(+), 36 deletions(-)
>



[dpdk-dev] [PATCH v2 3/5] vhost: enable promisc mode and config VMDQ offload register for multicast feature

2015-01-08 Thread Xie, Huawei
> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index 27ba175..744156c 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -68,7 +68,9 @@ static struct virtio_net_device_ops const *notify_ops;
>  static struct virtio_net_config_ll   *ll_root;
> 
>  /* Features supported by this application. RX merge buffers are enabled by
> default. */
> -#define VHOST_SUPPORTED_FEATURES (1ULL << VIRTIO_NET_F_MRG_RXBUF)
> +#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF)
> | \
> + (1ULL << VIRTIO_NET_F_CTRL_RX))
> +
CTRL_RX depends on CTRL_VQ, so CTRL_VQ should be enabled whenever
CTRL_RX is enabled.
We observed that the virtio-net driver will crash if CTRL_VQ isn't enabled in 
the vhost-user case.
/* Caller should know better */
BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ||
(out + in > VIRTNET_SEND_COMMAND_SG_MAX)); 
>  static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
> 
>  /* Line size for reading maps file. */
> --
> 1.8.4.2



[dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum forwarding engine

2015-01-08 Thread Ananyev, Konstantin
Hi Frank,

> -Original Message-
> From: Liu, Jijiang
> Sent: Thursday, January 08, 2015 8:52 AM
> To: Ananyev, Konstantin; 'Olivier MATZ'
> Cc: dev at dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and csum 
> forwarding engine
> 
> Hi,
> 
> > -Original Message-
> > From: Ananyev, Konstantin
> > Sent: Wednesday, January 7, 2015 8:07 PM
> > To: Liu, Jijiang; 'Olivier MATZ'
> > Cc: dev at dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and
> > csum forwarding engine
> >
> >
> >
> > > -Original Message-
> > > From: Liu, Jijiang
> > > Sent: Wednesday, January 07, 2015 11:39 AM
> > > To: Ananyev, Konstantin; 'Olivier MATZ'
> > > Cc: dev at dpdk.org
> > > Subject: RE: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command and
> > > csum forwarding engine
> > >
> > > Hi Konstantin,
> > >
> > > > -Original Message-
> > > > From: Ananyev, Konstantin
> > > > Sent: Wednesday, January 7, 2015 5:59 PM
> > > > To: Liu, Jijiang; 'Olivier MATZ'
> > > > Cc: dev at dpdk.org
> > > > Subject: RE: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command
> > > > and csum forwarding engine
> > > >
> > > > Hi Frank,
> > > >
> > > > > -Original Message-
> > > > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Liu, Jijiang
> > > > > Sent: Wednesday, January 07, 2015 2:04 AM
> > > > > To: 'Olivier MATZ'
> > > > > Cc: dev at dpdk.org
> > > > > Subject: Re: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum command
> > > > > and csum forwarding engine
> > > > >
> > > > > Hi Olivier,
> > > > >
> > > > > > -Original Message-
> > > > > > From: Olivier MATZ [mailto:olivier.matz at 6wind.com]
> > > > > > Sent: Saturday, December 13, 2014 12:33 AM
> > > > > > To: Liu, Jijiang
> > > > > > Cc: dev at dpdk.org
> > > > > > Subject: Re: [dpdk-dev] [PATCH v3 0/3] enhance TX checksum
> > > > > > command and csum forwarding engine
> > > > > >
> > > > > > Hello,
> > > > > >
> > > > > > On 12/12/2014 04:48 AM, Liu, Jijiang wrote:
> > > > > > > The 'hw/sw' option  is used to set/clear the flag of enabling
> > > > > > > TX tunneling packet
> > > > > > checksum hardware offload in testpmd application.
> > > > > >
> > > > > > This is not clear at all.
> > > > > > In your command, there is (hw|sw|none).
> > > > > > Are you talking about inner or outer?
> > > > > > Is this command useful for any kind of packet?
> > > > > > How does it combine with "tx_checksum set outer-ip (hw|sw)"?
> > > > > >
> > > > >
> > > > > I rethink these TX checksum commands in this patch set and agree
> > > > > with you that we should make some changes for having clear meaning for
> > them.
> > > > >
> > > > > There are  3 commands in patch set as follows, 1. tx_checksum set
> > > > > tunnel (hw|sw|none) (port-id)
> > > > >
> > > > > Now I also think the command 1 may confuse user, they probably
> > > > > don't understand  why we need 'hw' or 'sw' option and when  to use
> > > > > the two option, so I will replace the command with 'tx_checksum
> > > > > set hw-tunnel-mode
> > > > (on|off) (port-id)' command.
> > > >
> > > > I am a bit confused here, could you explain what would be a
> > > > behaviour for 'on' and 'off'?
> > > > Konstantin
> > >
> > > I have explained the behaviour for 'on' and'off' below,
> > >
> > > The command 'tx_checksum set hw-tunnel-mode (on|off)  (port-id)' is
> > > used to set/clear  TESTPMD_TX_OFFLOAD_TUNNEL_CKSUM flag.
> > >
> > > Actually, the PKT_TX_UDP_TUNNEL_PKT offload flag will be set if the
> > > testpmd flag is set, which means to tell HW treat  that transmit
> > > packet as a tunneling packet to do checksum offload  When 'on' is set, 
> > > which is
> > able to meet Method B.1 and method C.
> > >
> > > When 'off' is set, the TESTPMD_TX_OFFLOAD_TUNNEL_CKSUM is not needed
> > > to set, so the PKT_TX_UDP_TUNNEL_PKT offload flag is not needed to
> > > set,  then HW treat  that transmit packet as a non-tunneling packet. It 
> > > is able to
> > meet Method B.2.
> > >
> > > Is the explanation not clear?
> >
> > Ok, and how I can set method A (testpmd treat all packets as non-tunnelling 
> > and
> > never look beyond outer L4 header) then?
> > Konstantin
> 
> 
> > > > > As to case A, I think it is not mandatory to cover it in csum fwd
> > > > > engine for tunneling packet.
> 
> If you think the case A is essential, and it must be covered in  csum fwd, 
> then we can add a command:
> 
> tx_checksum set sw-tunnel-mode (on|off)  (port-id)
> 
> if the 'off' is set ,  csum fwd engine don't check  if that packet is a 
> tunneling packet and treat all packets as non-tunneling and never
> look beyond outer L4 header.
> 
> if the 'on' is set,  csum fwd engine will check if that packet is a tunneling 
> packet.
> 
> And we are able to test all of cases in  
> http://dpdk.org/ml/archives/dev/2014-December/009213.html
> 
> Test case A:
> 
> tx_checksum set sw-tunnel-mode off
> tx_checksum set hw-tunnel-mode off
> tx_checksum set  ip   hw
> 
> test case 

[dpdk-dev] Packet Rx issue with DPDK1.8

2015-01-08 Thread Bruce Richardson
On Thu, Jan 08, 2015 at 01:40:54PM +0530, Prashant Upadhyaya wrote:
> Hi,
> 
> I am migrating from DPDK1.7 to DPDK1.8.
> My application works fine with DPDK1.7.
> I am using 10 Gb Intel 82599 NIC.
> I have jumbo frames enabled, with max_rx_pkt_len = 10232
> My mbuf dataroom size is 2048+headroom
> So naturally the ixgbe_recv_scattered_pkts driver function is triggered for
> receiving.
> This works with DPDK1.7 and my app receives packets.
> However, it does not work with DPDK1.8 somehow. I don't receive any packets.
> 
> So, I increased the mbuf data room size in my application to a higher value
> so that the function ixgbe_recv_scattered_pkts is not enabled (I believe
> ixgbe_recv_pkts will be used in this case), and now my application starts
> getting packets with DPDK1.8 and the entire application usecase works fine
> (ofcourse my application had to adapt to the mbuf structure changes which I
> have done)
> 
> I am kind of coming to the conclusion that ixgbe_recv_scattered_pkts has
> something broken in DPDK1.8 as compared to the earlier versions by the
> above empirical evidence.
> 
> Has anybody else faced a similar issue ?
> 
> Regards
> -Prashant

This is worrying to hear. In 1.8, there is now the receive_scattered_pkts_vec
function which manages changed mbufs. This was tested - both in development and
in validation - before release, but since it's new code, it's entirely possible
we missed something. Can you perhaps try disabling the vector driver in 1.8,
and see if receiving scattered packets/chained mbufs works? 

Regards,
/Bruce


[dpdk-dev] [PATCH 1/3] librte_reorder: New reorder library

2015-01-08 Thread Pattan, Reshma


> -Original Message-
> From: Neil Horman [mailto:nhorman at tuxdriver.com]
> Sent: Wednesday, January 7, 2015 5:45 PM
> To: Pattan, Reshma
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 1/3] librte_reorder: New reorder library
> 
> On Wed, Jan 07, 2015 at 04:39:11PM +, Reshma Pattan wrote:
> > From: Reshma Pattan 
> >
> > 1)New library to provide reordering of out of ordered
> > mbufs based on sequence number of mbuf. Library uses reorder 
> > buffer
> structure
> > which in turn uses two circular buffers called ready and order 
> > buffers.
> > *rte_reorder_create API creates instance of reorder buffer.
> > *rte_reorder_init API initializes given reorder buffer instance.
> > *rte_reorder_reset API resets given reorder buffer instance.
> > *rte_reorder_insert API inserts the mbuf into order circular 
> > buffer.
> > *rte_reorder_fill_overflow moves mbufs from order buffer to 
> > ready
> buffer
> > to accommodate early packets in order buffer.
> > *rte_reorder_drain API provides draining facility to fetch out
> > reordered mbufs from order and ready buffers.
> >
> > Signed-off-by: Reshma Pattan 
> > Signed-off-by: Richardson Bruce 
> > ---
> >  config/common_bsdapp   |   5 +
> >  config/common_linuxapp |   5 +
> >  lib/Makefile   |   1 +
> >  lib/librte_eal/common/include/rte_tailq_elem.h |   2 +
> >  lib/librte_mbuf/rte_mbuf.h |   3 +
> >  lib/librte_reorder/Makefile|  50 +++
> >  lib/librte_reorder/rte_reorder.c   | 464 
> > +
> >  lib/librte_reorder/rte_reorder.h   | 184 ++
> >  8 files changed, 714 insertions(+)
> >  create mode 100644 lib/librte_reorder/Makefile  create mode 100644
> > lib/librte_reorder/rte_reorder.c  create mode 100644
> > lib/librte_reorder/rte_reorder.h
> > +
> > +int
> > +rte_reorder_insert(struct rte_reorder_buffer *b, struct rte_mbuf
> > +*mbuf) {
> > +   uint32_t offset, position;
> > +   struct cir_buffer *order_buf = >order_buf;
> > +
> > +   /*
> > +* calculate the offset from the head pointer we need to go.
> > +* The subtraction takes care of the sequence number wrapping.
> > +* For example (using 16-bit for brevity):
> > +*  min_seqn  = 0xFFFD
> > +*  mbuf_seqn = 0x0010
> > +*  offset= 0x0010 - 0xFFFD = 0x13
> > +*/
> > +   offset = mbuf->seqn - b->min_seqn;
> > +
> > +   /*
> > +* action to take depends on offset.
> > +* offset < buffer->size: the mbuf fits within the current window of
> > +*sequence numbers we can reorder. EXPECTED CASE.
> > +* offset > buffer->size: the mbuf is outside the current window. There
> > +*are a number of cases to consider:
> > +*1. The packet sequence is just outside the window, then we need
> > +*   to see about shifting the head pointer and taking any ready
> > +*   to return packets out of the ring. If there was a delayed
> > +*   or dropped packet preventing drains from shifting the window
> > +*   this case will skip over the dropped packet instead, and any
> > +*   packets dequeued here will be returned on the next drain call.
> > +*2. The packet sequence number is vastly outside our window, taken
> > +*   here as having offset greater than twice the buffer size. In
> > +*   this case, the packet is probably an old or late packet that
> > +*   was previously skipped, so just enqueue the packet for
> > +*   immediate return on the next drain call, or else return error.
> > +*/
> > +   if (offset < b->order_buf.size) {
> > +   position = (order_buf->head + offset) & order_buf->mask;
> > +   order_buf->entries[position] = mbuf;
> > +   } else if (offset < 2 * b->order_buf.size) {
> > +   if (rte_reorder_fill_overflow(b, offset - order_buf->size) <
> > +   offset - order_buf->size) {
> > +   /* Put in handling for enqueue straight to output */
> > +   rte_errno = ENOSPC;
> > +   return -1;
> > +   }
> > +   offset = mbuf->seqn - b->min_seqn;
> > +   position = (order_buf->head + offset) & order_buf->mask;
> > +   order_buf->entries[position] = mbuf;
> > +   } else {
> > +   /* Put in handling for enqueue straight to output */
> > +   rte_errno = ERANGE;
> > +   return -1;
> > +   }
> How does this work if you get two packets with the same sequence number?
> That situation seems like it would happen frequently with your example app, 
> and
> from my read of the above, you just wind up overwriting the same pointer in
> ther entries array here, which leads to silent packet loss.

Hi Neil,


[dpdk-dev] [PATCH 1/3] librte_reorder: New reorder library

2015-01-08 Thread Pattan, Reshma


> -Original Message-
> From: Richard Sanger [mailto:rsangerarj at gmail.com]
> Sent: Wednesday, January 7, 2015 9:10 PM
> To: Pattan, Reshma
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 1/3] librte_reorder: New reorder library
> 
> On 8 January 2015 at 05:39, Reshma Pattan  wrote:
> > From: Reshma Pattan 
> >
> > 1)New library to provide reordering of out of ordered
> > mbufs based on sequence number of mbuf. Library uses reorder 
> > buffer
> structure
> > which in turn uses two circular buffers called ready and order 
> > buffers.
> > *rte_reorder_create API creates instance of reorder buffer.
> > *rte_reorder_init API initializes given reorder buffer instance.
> > *rte_reorder_reset API resets given reorder buffer instance.
> > *rte_reorder_insert API inserts the mbuf into order circular 
> > buffer.
> > *rte_reorder_fill_overflow moves mbufs from order buffer to 
> > ready
> buffer
> > to accommodate early packets in order buffer.
> > *rte_reorder_drain API provides draining facility to fetch out
> > reordered mbufs from order and ready buffers.
> >
> > Signed-off-by: Reshma Pattan 
> > Signed-off-by: Richardson Bruce 
> > ---
> >  config/common_bsdapp   |   5 +
> >  config/common_linuxapp |   5 +
> >  lib/Makefile   |   1 +
> >  lib/librte_eal/common/include/rte_tailq_elem.h |   2 +
> >  lib/librte_mbuf/rte_mbuf.h |   3 +
> >  lib/librte_reorder/Makefile|  50 +++
> >  lib/librte_reorder/rte_reorder.c   | 464 
> > +
> >  lib/librte_reorder/rte_reorder.h   | 184 ++
> >  8 files changed, 714 insertions(+)
> >  create mode 100644 lib/librte_reorder/Makefile  create mode 100644
> > lib/librte_reorder/rte_reorder.c  create mode 100644
> > lib/librte_reorder/rte_reorder.h
> >
> > diff --git a/config/common_bsdapp b/config/common_bsdapp index
> > 9177db1..e3e0e94 100644
> > --- a/config/common_bsdapp
> > +++ b/config/common_bsdapp
> > @@ -334,6 +334,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
> >  CONFIG_RTE_LIBRTE_DISTRIBUTOR=y
> >
> >  #
> > +# Compile the reorder library
> > +#
> > +CONFIG_RTE_LIBRTE_REORDER=y
> > +
> > +#
> >  # Compile librte_port
> >  #
> >  CONFIG_RTE_LIBRTE_PORT=y
> > diff --git a/config/common_linuxapp b/config/common_linuxapp index
> > 2f9643b..b5ec730 100644
> > --- a/config/common_linuxapp
> > +++ b/config/common_linuxapp
> > @@ -342,6 +342,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
> >  CONFIG_RTE_LIBRTE_DISTRIBUTOR=y
> >
> >  #
> > +# Compile the reorder library
> > +#
> > +CONFIG_RTE_LIBRTE_REORDER=y
> > +
> > +#
> >  # Compile librte_port
> >  #
> >  CONFIG_RTE_LIBRTE_PORT=y
> > diff --git a/lib/Makefile b/lib/Makefile index 0ffc982..5919d32 100644
> > --- a/lib/Makefile
> > +++ b/lib/Makefile
> > @@ -65,6 +65,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) +=
> > librte_distributor
> >  DIRS-$(CONFIG_RTE_LIBRTE_PORT) += librte_port
> >  DIRS-$(CONFIG_RTE_LIBRTE_TABLE) += librte_table
> >  DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += librte_pipeline
> > +DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder
> >
> >  ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
> >  DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni diff --git
> > a/lib/librte_eal/common/include/rte_tailq_elem.h
> > b/lib/librte_eal/common/include/rte_tailq_elem.h
> > index f74fc7c..3013869 100644
> > --- a/lib/librte_eal/common/include/rte_tailq_elem.h
> > +++ b/lib/librte_eal/common/include/rte_tailq_elem.h
> > @@ -84,6 +84,8 @@ rte_tailq_elem(RTE_TAILQ_ACL, "RTE_ACL")
> >
> >  rte_tailq_elem(RTE_TAILQ_DISTRIBUTOR, "RTE_DISTRIBUTOR")
> >
> > +rte_tailq_elem(RTE_TAILQ_REORDER, "RTE_REORDER")
> > +
> >  rte_tailq_end(RTE_TAILQ_NUM)
> >
> >  #undef rte_tailq_elem
> > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> > index 16059c6..ed27eb8 100644
> > --- a/lib/librte_mbuf/rte_mbuf.h
> > +++ b/lib/librte_mbuf/rte_mbuf.h
> > @@ -262,6 +262,9 @@ struct rte_mbuf {
> > uint32_t usr; /**< User defined tags. See 
> > @rte_distributor_process
> */
> > } hash;   /**< hash information */
> >
> > +   /* sequence number - field used in distributor and reorder library 
> > */
> > +   uint32_t seqn;
> > +
> > /* second cache line - fields only used in slow path or on TX */
> > MARKER cacheline1 __rte_cache_aligned;
> >
> > diff --git a/lib/librte_reorder/Makefile b/lib/librte_reorder/Makefile
> > new file mode 100644 index 000..12b916f
> > --- /dev/null
> > +++ b/lib/librte_reorder/Makefile
> > @@ -0,0 +1,50 @@
> > +#   BSD LICENSE
> > +#
> > +#   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> > +#   All rights reserved.
> > +#
> > +#   Redistribution and use in source and binary forms, with or without
> > 

[dpdk-dev] [RFC PATCH 0/7] support multi-phtread per lcore

2015-01-08 Thread Ananyev, Konstantin

Hi Steve,

> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Liang, Cunming
> Sent: Tuesday, December 23, 2014 9:52 AM
> To: Stephen Hemminger; Richardson, Bruce
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [RFC PATCH 0/7] support multi-phtread per lcore
> 
> 
> 
> > -Original Message-
> > From: Stephen Hemminger [mailto:stephen at networkplumber.org]
> > Sent: Tuesday, December 23, 2014 2:29 AM
> > To: Richardson, Bruce
> > Cc: Liang, Cunming; dev at dpdk.org
> > Subject: Re: [dpdk-dev] [RFC PATCH 0/7] support multi-phtread per lcore
> >
> > On Mon, 22 Dec 2014 09:46:03 +
> > Bruce Richardson  wrote:
> >
> > > On Mon, Dec 22, 2014 at 01:51:27AM +, Liang, Cunming wrote:
> > > > ...
> > > > > I'm conflicted on this one. However, I think far more applications 
> > > > > would be
> > > > > broken
> > > > > to start having to use thread_id in place of an lcore_id than would be
> > broken
> > > > > by having the lcore_id no longer actually correspond to a core.
> > > > > I'm actually struggling to come up with a large number of scenarios 
> > > > > where
> > it's
> > > > > important to an app to determine the cpu it's running on, compared to 
> > > > > the
> > large
> > > > > number of cases where you need to have a data-structure per thread. In
> > DPDK
> > > > > libs
> > > > > alone, you see this assumption that lcore_id == thread_id a large 
> > > > > number
> > of
> > > > > times.
> > > > >
> > > > > Despite the slight logical inconsistency, I think it's better to avoid
> > introducing
> > > > > a thread-id and continue having lcore_id representing a unique thread.
> > > > >
> > > > > /Bruce
> > > >
> > > > Ok, I understand it.
> > > > I list the implicit meaning if using lcore_id representing the unique 
> > > > thread.
> > > > 1). When lcore_id less than RTE_MAX_LCORE, it still represents the 
> > > > logical
> > core id.
> > > > 2). When lcore_id large equal than RTE_MAX_LCORE, it represents an 
> > > > unique
> > id for thread.
> > > > 3). Most of APIs(except rte_lcore_id()) in rte_lcore.h suggest to be 
> > > > used only
> > in CASE 1)
> > > > 4). rte_lcore_id() can be used in CASE 2), but the return value no 
> > > > matter
> > represent a logical core id.
> > > >
> > > > If most of us feel it's acceptable, I'll prepare for the RFC v2 base on 
> > > > this
> > conclusion.
> > > >
> > > > /Cunming
> > >
> > > Sorry, I don't like that suggestion either, as having lcore_id values 
> > > greater
> > > than RTE_MAX_LCORE is terrible, as how will people know how to dimension
> > arrays
> > > to be indexes by lcore id? Given the choice, if we are not going to just 
> > > use
> > > lcore_id as a generic thread id, which is always between 0 and
> > RTE_MAX_LCORE
> > > we can look to define a new thread_id variable to hold that. However, it 
> > > should
> > > have a bounded range.
> > > From an ease-of-porting perspective, I still think that the simplest 
> > > option is to
> > > use the existing lcore_id and accept the fact that it's now a thread id 
> > > rather
> > > than an actual physical lcore. Question is, is would that cause us lots 
> > > of issues
> > > in the future?
> > >
> > > /Bruce
> >
> > The current rte_lcore_id() has different meaning the thread. Your proposal 
> > will
> > break code that uses lcore_id to do per-cpu statistics and the lcore_config
> > code in the samples.
> > q
> [Liang, Cunming] +1.

Few more thoughts on that subject:

Actually, one more place in the lib where lcore_id is used (and where it should 
be unique):
rte_spinlock_recursive_lock() / rte_spinlock_recursive_trylock().
So if we are going to replace lcore_id with thread_id as a unique thread index, 
then these functions
have to be updated too.

About maintaining our own unique thread_id inside shared memory 
(_get_linear_tid()/_put_linear_tid()).
There is one thing that worries me with that approach:
In case of abnormal process termination, TIDs used by that process will remain 
'reserved'
and there is no way to know which TIDs were used by terminated process.
So there could be a situation with the DPDK multi-process model where,
after abnormal termination of a secondary process, it wouldn't be possible to 
restart it -
we would simply run out of 'free' TIDs. 

Which makes me think probably there is no need to introduce new globally unique 
'thread_id'?
Might be just lcore_id is enough?  
As Mirek and Bruce suggested we can treat it a sort of 'unique thread id' 
inside EAL.
Or as 'virtual' core id that can run on set of physical cpus, and these subsets 
for different 'virtual' cores can intersect.
Then basically we can keep legacy behaviour with '-c ,' where each
lcore_id matches one to one  with physical cpu, and introduce new one, 
something like:
--lcores='()=(),..(, 
thread  with lcore_id=10 is binded to  cpu 7, and allow to run 

[dpdk-dev] [RFC PATCH 0/7] support multi-phtread per lcore

2015-01-08 Thread Richardson, Bruce
My opinion on this is that the lcore_id is rarely (if ever) used to find the 
actual core a thread is being run on. Instead it is used 99% of the time as a 
unique array index per thread, and therefore that we can keep that usage by 
just assigning a valid lcore_id to any extra threads created. The suggestion to 
get/set affinities on top of that seems a good one to me also.

/Bruce

-Original Message-
From: Ananyev, Konstantin 
Sent: Thursday, January 8, 2015 5:06 PM
To: Liang, Cunming; Stephen Hemminger; Richardson, Bruce
Cc: dev at dpdk.org
Subject: RE: [dpdk-dev] [RFC PATCH 0/7] support multi-phtread per lcore


Hi Steve,

> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Liang, Cunming
> Sent: Tuesday, December 23, 2014 9:52 AM
> To: Stephen Hemminger; Richardson, Bruce
> Cc: dev at dpdk.org
> Subject: Re: [dpdk-dev] [RFC PATCH 0/7] support multi-phtread per 
> lcore
> 
> 
> 
> > -Original Message-
> > From: Stephen Hemminger [mailto:stephen at networkplumber.org]
> > Sent: Tuesday, December 23, 2014 2:29 AM
> > To: Richardson, Bruce
> > Cc: Liang, Cunming; dev at dpdk.org
> > Subject: Re: [dpdk-dev] [RFC PATCH 0/7] support multi-phtread per 
> > lcore
> >
> > On Mon, 22 Dec 2014 09:46:03 +
> > Bruce Richardson  wrote:
> >
> > > On Mon, Dec 22, 2014 at 01:51:27AM +, Liang, Cunming wrote:
> > > > ...
> > > > > I'm conflicted on this one. However, I think far more 
> > > > > applications would be broken to start having to use thread_id 
> > > > > in place of an lcore_id than would be
> > broken
> > > > > by having the lcore_id no longer actually correspond to a core.
> > > > > I'm actually struggling to come up with a large number of 
> > > > > scenarios where
> > it's
> > > > > important to an app to determine the cpu it's running on, 
> > > > > compared to the
> > large
> > > > > number of cases where you need to have a data-structure per 
> > > > > thread. In
> > DPDK
> > > > > libs
> > > > > alone, you see this assumption that lcore_id == thread_id a 
> > > > > large number
> > of
> > > > > times.
> > > > >
> > > > > Despite the slight logical inconsistency, I think it's better 
> > > > > to avoid
> > introducing
> > > > > a thread-id and continue having lcore_id representing a unique thread.
> > > > >
> > > > > /Bruce
> > > >
> > > > Ok, I understand it.
> > > > I list the implicit meaning if using lcore_id representing the unique 
> > > > thread.
> > > > 1). When lcore_id less than RTE_MAX_LCORE, it still represents 
> > > > the logical
> > core id.
> > > > 2). When lcore_id large equal than RTE_MAX_LCORE, it represents 
> > > > an unique
> > id for thread.
> > > > 3). Most of APIs(except rte_lcore_id()) in rte_lcore.h suggest 
> > > > to be used only
> > in CASE 1)
> > > > 4). rte_lcore_id() can be used in CASE 2), but the return value 
> > > > no matter
> > represent a logical core id.
> > > >
> > > > If most of us feel it's acceptable, I'll prepare for the RFC v2 
> > > > base on this
> > conclusion.
> > > >
> > > > /Cunming
> > >
> > > Sorry, I don't like that suggestion either, as having lcore_id 
> > > values greater than RTE_MAX_LCORE is terrible, as how will people 
> > > know how to dimension
> > arrays
> > > to be indexes by lcore id? Given the choice, if we are not going 
> > > to just use lcore_id as a generic thread id, which is always 
> > > between 0 and
> > RTE_MAX_LCORE
> > > we can look to define a new thread_id variable to hold that. 
> > > However, it should have a bounded range.
> > > From an ease-of-porting perspective, I still think that the 
> > > simplest option is to use the existing lcore_id and accept the 
> > > fact that it's now a thread id rather than an actual physical 
> > > lcore. Question is, is would that cause us lots of issues in the future?
> > >
> > > /Bruce
> >
> > The current rte_lcore_id() has different meaning the thread. Your 
> > proposal will break code that uses lcore_id to do per-cpu statistics 
> > and the lcore_config code in the samples.
> > q
> [Liang, Cunming] +1.

Few more thoughts on that subject:

Actually, one more place in the lib where lcore_id is used (and where it should 
be unique):
rte_spinlock_recursive_lock() / rte_spinlock_recursive_trylock().
So if we are going to replace lcore_id with thread_id as a unique thread index, 
then these functions have to be updated too.

About maintaining our own unique thread_id inside shared memory 
(_get_linear_tid()/_put_linear_tid()).
There is one thing that worries me with that approach:
In case of abnormal process termination, TIDs used by that process will remain 
'reserved'
and there is no way to know which TIDs were used by terminated process.
So there could be a situation with the DPDK multi-process model where, after 
abnormal termination of a secondary process, it wouldn't be possible to restart 
it - we would simply run out of 'free' TIDs. 

Which makes me think probably there is no need to introduce new globally unique 
'thread_id'?
Might be just 

[dpdk-dev] IOMMU and VF

2015-01-08 Thread Alex Markuze
Hi, Guys,
I'm trying to run a DPDK(1.7.1) application that has been previously tested
on Xen/VMware VM's. I have both iommu=pt and intel_iommu=on.
I would expect things to work as usual but unfortunately the VF I'm taking
is unable to send or receive any packets (The TXQ gets filled out, and the
packets never leave).

Looking at the dmesg output I see this:
IOMMU: hardware identity mapping for device :83:00.0
IOMMU: hardware identity mapping for device :83:00.1

These are the bus addresses of the physical functions.
I don't know If I need to see the VF's listed here as well.

Any suggestions?


[dpdk-dev] [PATCH v5 4/6] ether: Check VMDq RSS mode

2015-01-08 Thread Vlad Zolotarov

On 01/08/15 11:19, Vlad Zolotarov wrote:
>
> On 01/07/15 08:32, Ouyang Changchun wrote:
>> Check mq mode for VMDq RSS, handle it correctly instead of returning 
>> an error;
>> Also remove the limitation of per pool queue number has max value of 
>> 1, because
>> the per pool queue number could be 2 or 4 if it is VMDq RSS mode;
>>
>> The number of rxq specified in config will determine the mq mode for 
>> VMDq RSS.
>>
>> Signed-off-by: Changchun Ouyang 
>>
>> changes in v5:
>>- Fix '<' issue, it should be '<=' to test rxq number;
>>- Extract a function to remove the embedded switch-case statement.
>>
>> ---
>>   lib/librte_ether/rte_ethdev.c | 50 
>> ++-
>>   1 file changed, 45 insertions(+), 5 deletions(-)
>>
>> diff --git a/lib/librte_ether/rte_ethdev.c 
>> b/lib/librte_ether/rte_ethdev.c
>> index 95f2ceb..8363e26 100644
>> --- a/lib/librte_ether/rte_ethdev.c
>> +++ b/lib/librte_ether/rte_ethdev.c
>> @@ -503,6 +503,31 @@ rte_eth_dev_tx_queue_config(struct rte_eth_dev 
>> *dev, uint16_t nb_queues)
>>   }
>> static int
>> +rte_eth_dev_check_vf_rss_rxq_num(uint8_t port_id, uint16_t nb_rx_q)
>> +{
>> +struct rte_eth_dev *dev = &rte_eth_devices[port_id];
>> +switch (nb_rx_q) {
>> +case 1:
>> +case 2:
>> +RTE_ETH_DEV_SRIOV(dev).active =
>> +ETH_64_POOLS;
>> +break;
>> +case 4:
>> +RTE_ETH_DEV_SRIOV(dev).active =
>> +ETH_32_POOLS;
>> +break;
>> +default:
>> +return -EINVAL;
>> +}
>> +
>> +RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool = nb_rx_q;
>> +RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx =
>> +dev->pci_dev->max_vfs * nb_rx_q;
>> +
>> +return 0;
>> +}
>> +
>> +static int
>>   rte_eth_dev_check_mq_mode(uint8_t port_id, uint16_t nb_rx_q, 
>> uint16_t nb_tx_q,
>> const struct rte_eth_conf *dev_conf)
>>   {
>> @@ -510,8 +535,7 @@ rte_eth_dev_check_mq_mode(uint8_t port_id, 
>> uint16_t nb_rx_q, uint16_t nb_tx_q,
>> if (RTE_ETH_DEV_SRIOV(dev).active != 0) {
>>   /* check multi-queue mode */
>> -if ((dev_conf->rxmode.mq_mode == ETH_MQ_RX_RSS) ||
>> -(dev_conf->rxmode.mq_mode == ETH_MQ_RX_DCB) ||
>> +if ((dev_conf->rxmode.mq_mode == ETH_MQ_RX_DCB) ||
>>   (dev_conf->rxmode.mq_mode == ETH_MQ_RX_DCB_RSS) ||
>>   (dev_conf->txmode.mq_mode == ETH_MQ_TX_DCB)) {
>>   /* SRIOV only works in VMDq enable mode */
>> @@ -525,7 +549,6 @@ rte_eth_dev_check_mq_mode(uint8_t port_id, 
>> uint16_t nb_rx_q, uint16_t nb_tx_q,
>>   }
>> switch (dev_conf->rxmode.mq_mode) {
>> -case ETH_MQ_RX_VMDQ_RSS:
>>   case ETH_MQ_RX_VMDQ_DCB:
>>   case ETH_MQ_RX_VMDQ_DCB_RSS:
>>   /* DCB/RSS VMDQ in SRIOV mode, not implement yet */
>> @@ -534,6 +557,25 @@ rte_eth_dev_check_mq_mode(uint8_t port_id, 
>> uint16_t nb_rx_q, uint16_t nb_tx_q,
>>   "unsupported VMDQ mq_mode rx %u\n",
>>   port_id, dev_conf->rxmode.mq_mode);
>>   return (-EINVAL);
>> +case ETH_MQ_RX_RSS:
>> +PMD_DEBUG_TRACE("ethdev port_id=%" PRIu8
>> +" SRIOV active, "
>> +"Rx mq mode is changed from:"
>> +"mq_mode %u into VMDQ mq_mode %u\n",
>> +port_id,
>> +dev_conf->rxmode.mq_mode,
>> +dev->data->dev_conf.rxmode.mq_mode);
>> +case ETH_MQ_RX_VMDQ_RSS:
>> +dev->data->dev_conf.rxmode.mq_mode = ETH_MQ_RX_VMDQ_RSS;
>> +if (nb_rx_q <= RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool)
>> +if (rte_eth_dev_check_vf_rss_rxq_num(port_id, 
>> nb_rx_q) != 0) {
>> +PMD_DEBUG_TRACE("ethdev port_id=%d"
>> +" SRIOV active, invalid queue"
>> +" number for VMDQ RSS\n",
>> +port_id);
>
> Some nitpicking here: I'd add the allowed values descriptions to the 
> error message. Something like: "invalid queue number for VMDQ RSS. 
> Allowed values are 1, 2 or 4\n".
>
>> +return -EINVAL;
>> +}
>> +break;
>>   default: /* ETH_MQ_RX_VMDQ_ONLY or ETH_MQ_RX_NONE */
>>   /* if nothing mq mode configure, use default scheme */
>>   dev->data->dev_conf.rxmode.mq_mode = ETH_MQ_RX_VMDQ_ONLY;
>> @@ -553,8 +595,6 @@ rte_eth_dev_check_mq_mode(uint8_t port_id, 
>> uint16_t nb_rx_q, uint16_t nb_tx_q,
>>   default: /* ETH_MQ_TX_VMDQ_ONLY or ETH_MQ_TX_NONE */
>>   /* if nothing mq mode configure, use default scheme */
>>   dev->data->dev_conf.txmode.mq_mode = ETH_MQ_TX_VMDQ_ONLY;
>> -if (RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool > 1)
>> -RTE_ETH_DEV_SRIOV(dev).nb_q_per_pool = 1;
>
> I'm not sure u may just remove it. These lines originally belong to a 
> different flow. Are u sure u can