[ovs-dev] [PATCH 2/4] dpif-netdev: Incremental addition/deletion of PMD threads.

2017-02-21 Thread Ilya Maximets
Currently, changing 'pmd-cpu-mask' is a very heavy operation:
it requires destroying all the PMD threads and creating them
back. After that, all the threads sleep until the ports'
redistribution is finished.

This patch adds the ability to not stop the datapath while
adjusting the number/placement of PMD threads. All unaffected
threads will keep forwarding traffic without any additional
latency.

An id-pool is created for static tx queue ids to keep them
sequential in a flexible way. The non-PMD thread will always
have static_tx_qid = 0, as before.
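A minimal sketch of the intended allocation scheme (the diff below is
truncated in this archive; the helper names and locking here are an
assumption based on the description above, not a quote from the patch):

    static void
    dp_netdev_pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
    {
        ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
        /* Pool holds one id per possible core + 1, so this cannot fail. */
        ovs_assert(id_pool_alloc_id(pmd->dp->tx_qid_pool,
                                    &pmd->static_tx_qid));
        ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
    }

    static void
    dp_netdev_pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
    {
        ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
        /* Returning the id keeps the remaining ids sequential from 0. */
        id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
        ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
    }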

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c | 119 +-
 tests/pmd.at  |   2 +-
 2 files changed, 91 insertions(+), 30 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 30907b7..6e575ab 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -48,6 +48,7 @@
 #include "fat-rwlock.h"
 #include "flow.h"
 #include "hmapx.h"
+#include "id-pool.h"
 #include "latch.h"
 #include "netdev.h"
 #include "netdev-vport.h"
@@ -241,6 +242,9 @@ struct dp_netdev {
 
 /* Stores all 'struct dp_netdev_pmd_thread's. */
 struct cmap poll_threads;
+/* id pool for per thread static_tx_qid. */
+struct id_pool *tx_qid_pool;
+struct ovs_mutex tx_qid_pool_mutex;
 
 /* Protects the access of the 'struct dp_netdev_pmd_thread'
  * instance for non-pmd thread. */
@@ -514,7 +518,7 @@ struct dp_netdev_pmd_thread {
 /* Queue id used by this pmd thread to send packets on all netdevs if
  * XPS disabled for this netdev. All static_tx_qid's are unique and less
  * than 'cmap_count(dp->poll_threads)'. */
-const int static_tx_qid;
+uint32_t static_tx_qid;
 
 struct ovs_mutex port_mutex;/* Mutex for 'poll_list' and 'tx_ports'. */
 /* List of rx queues to poll. */
 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
   unsigned core_id);
 static struct dp_netdev_pmd_thread *
 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
+static void dp_netdev_del_pmd(struct dp_netdev *dp,
+  struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
@@ -1077,10 +1083,17 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
 
 cmap_init(&dp->poll_threads);
+
+ovs_mutex_init(&dp->tx_qid_pool_mutex);
+/* We need 1 Tx queue for each possible cpu core + 1 for non-PMD threads. */
+dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
+
 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
 ovsthread_key_create(&dp->per_pmd_key, NULL);
 
 ovs_mutex_lock(&dp->port_mutex);
+/* non-PMD will be created before all other threads and will
+ * allocate static_tx_qid = 0. */
 dp_netdev_set_nonpmd(dp);
 
 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
@@ -1164,6 +1177,9 @@ dp_netdev_free(struct dp_netdev *dp)
 dp_netdev_destroy_all_pmds(dp, true);
 cmap_destroy(&dp->poll_threads);
 
+ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
+id_pool_destroy(dp->tx_qid_pool);
+
 ovs_mutex_destroy(&dp->non_pmd_mutex);
 ovsthread_key_delete(dp->per_pmd_key);
 
@@ -3175,7 +3191,10 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
 {
 struct dp_netdev_pmd_thread *pmd;
 struct ovs_numa_dump *pmd_cores;
-bool changed = false;
+struct ovs_numa_info_core *core;
+struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
+struct hmapx_node *node;
+int created = 0, deleted = 0;
 
 /* The pmd threads should be started only if there's a pmd port in the
  * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
@@ -3188,45 +3207,62 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
 }
 
-/* Check for changed configuration */
-if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
-changed = true;
-} else {
-CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-if (pmd->core_id != NON_PMD_CORE_ID
-&& !ovs_numa_dump_contains_core(pmd_cores,
-pmd->numa_id,
-pmd->core_id)) {
-changed = true;
-break;
-}
+/* Check for unwanted pmd threads */
+CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+if (pmd->core_id != NON_PMD_CORE_ID
+&a

[ovs-dev] [PATCH 1/4] id-pool: Allocate the lowest available ids.

2017-02-21 Thread Ilya Maximets
This simple change makes id-pool always allocate the lowest
possible id from the pool. No other code is affected because,
currently, there are no users of 'id_pool_free_id' in OVS.

This behaviour of id-pool will be used in the next patch.
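For illustration, the resulting behaviour (a usage sketch, not part of
the patch):

    struct id_pool *pool = id_pool_create(0, 4);
    uint32_t a, b, c;

    id_pool_alloc_id(pool, &a);   /* a == 0 */
    id_pool_alloc_id(pool, &b);   /* b == 1 */
    id_pool_alloc_id(pool, &c);   /* c == 2 */
    id_pool_free_id(pool, b);
    id_pool_alloc_id(pool, &b);   /* With this patch: b == 1 again,
                                   * not 3. */
    id_pool_destroy(pool);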

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/id-pool.c | 3 +++
 lib/id-pool.h | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/id-pool.c b/lib/id-pool.c
index 62a6b33..8f005e0 100644
--- a/lib/id-pool.c
+++ b/lib/id-pool.c
@@ -148,6 +148,9 @@ id_pool_free_id(struct id_pool *pool, uint32_t id)
 id_node = id_pool_find(pool, id);
 if (id_node) {
 hmap_remove(&pool->map, &id_node->node);
+if (id < pool->next_free_id) {
+pool->next_free_id = id;
+}
 free(id_node);
 }
 }
diff --git a/lib/id-pool.h b/lib/id-pool.h
index 93a49c3..8721f87 100644
--- a/lib/id-pool.h
+++ b/lib/id-pool.h
@@ -35,7 +35,7 @@ void id_pool_add(struct id_pool *, uint32_t id);
  * ========
  *
  * Pool of unique 32bit ids.
- *
+ * Allocation always returns the lowest available id.
  *
  * Thread-safety
  * =============
-- 
2.7.4



[ovs-dev] [PATCH 3/4] dpif-netdev: Avoid port's reconfiguration on pmd-cpu-mask changes.

2017-02-21 Thread Ilya Maximets
Reconfiguration of HW NICs may lead to packet drops.
In the current model, all physical ports are reconfigured each
time the number of PMD threads changes. Since we no longer stop
the threads on pmd-cpu-mask changes, this patch further decreases
the ports' downtime by requesting the maximum possible number of
wanted tx queues up front, so that later changes in the number of
PMD threads trigger no reconfiguration. For example, on a host
with 8 cores each port is always asked for 9 tx queues (one per
possible core plus one for the non-PMD threads), no matter how
many PMD threads are currently running.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 6e575ab..e2b4f39 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3324,7 +3324,11 @@ reconfigure_datapath(struct dp_netdev *dp)
  * on the system and the user configuration. */
 reconfigure_pmd_threads(dp);
 
-wanted_txqs = cmap_count(&dp->poll_threads);
+/* We need 1 Tx queue for each possible cpu core. */
+wanted_txqs = ovs_numa_get_n_cores();
+ovs_assert(wanted_txqs != OVS_CORE_UNSPEC);
+/* And 1 Tx queue for non-PMD threads. */
+wanted_txqs++;
 
 /* The number of pmd threads might have changed, or a port can be new:
  * adjust the txqs. */
-- 
2.7.4



[ovs-dev] [PATCH 0/4] Incremental addition/deletion of PMD threads.

2017-02-21 Thread Ilya Maximets
Ilya Maximets (4):
  id-pool: Allocate the lowest available ids.
  dpif-netdev: Incremental addition/deletion of PMD threads.
  dpif-netdev: Avoid port's reconfiguration on pmd-cpu-mask changes.
  dpif-netdev: Don't uninit emc on reload.

 lib/dpif-netdev.c | 129 --
 lib/id-pool.c |   3 ++
 lib/id-pool.h |   2 +-
 tests/pmd.at  |   2 +-
 4 files changed, 102 insertions(+), 34 deletions(-)

-- 
2.7.4



Re: [ovs-dev] [PATCH v4] dpif-netdev: Conditional EMC insert

2017-02-09 Thread Ilya Maximets
Hi, Ciara.

> +EMC Insertion Probability
> +-------------------------
> +By default 1 in every 100 flows are inserted into the Exact Match Cache 
> (EMC).
> +It is possible to change this insertion probability by setting the
> +``emc-insert-prob`` option::
> +
> +$ ovs-vsctl --no-wait set Open_vSwitch . other_config:emc-insert-prob=N
> +
> +where:
> +
> +``N``
> +  is a positive integer between 0 and 4294967295 (maximum unsigned 32bit 
> int).
> +
> +If ``N`` is set to 1, an insertion will be performed for every flow. The 
> lower
> +the value of ``emc-insert-prob`` the higher the probability of insertion,
> +except for the value 0 which will result in no insertions being performed and
> +thus essentially disabling the EMC.

I totally don't understand why 'N' is an integer between 0 and 4294967295.
Why can't we just use 0-100% for that config?

In math, probability is a value between 0 and 1; as an integer, we could
use values from 0 to 100 for percent. IMHO, the current configuration is
kind of misleading, because 'emc-insert-prob' reads as
'emc-insert-probability' but means something completely different. Also,
are all of the values from 0 to 4294967295 really useful?
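For reference, the check that this option controls looks roughly like this
(paraphrased from the patch under review; not an exact quote):

    uint32_t min = n ? UINT32_MAX / n : 0;   /* n == emc-insert-prob. */

    if (min && random_uint32() <= min) {
        emc_insert(flow_cache, key, flow);   /* Probability ~1/n, not n. */
    }

So the knob is effectively an inverse probability.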

What do you think?

Best regards, Ilya Maximets.


Re: [ovs-dev] [PATCH v2 1/2] netdev-dpdk: Use intermediate queue during packet transmission.

2017-01-17 Thread Ilya Maximets
Not a complete review. This code is full of races.
See details inline.

Best regards, Ilya Maximets.

On 17.01.2017 18:37, antonio.fische...@intel.com wrote:
> This patch implements the intermediate Tx queues on 'dpdk' type ports.
> 
> Test results:
>  * In worst case scenario with fewer packets per batch, a significant
>bottleneck is observed for netdev_dpdk_eth_send() function due to
>expensive MMIO writes.
> 
>  * Also its observed that CPI(cycles per instruction) Rate for the function
>stood between 3.15 and 4.1 which is significantly higher than acceptable
>limit of 1.0 for HPC applications and theoretical limit of 0.25 (As Backend
>pipeline can retire 4 micro-operations in a cycle).
> 
>  * With this patch, CPI for netdev_dpdk_eth_send() is at 0.55 and the overall
>throughput improved significantly.
> 
> 
> Signed-off-by: Antonio Fischetti <antonio.fische...@intel.com>
> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com>
> Co-authored-by: Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com>
> Signed-off-by: Markus Magnusson <markus.magnus...@ericsson.com>
> Co-authored-by: Markus Magnusson <markus.magnus...@ericsson.com>
> ---
>  lib/dpif-netdev.c | 53 +++--
>  lib/netdev-bsd.c  |  1 +
>  lib/netdev-dpdk.c | 82 
> ++-
>  lib/netdev-dummy.c|  1 +
>  lib/netdev-linux.c|  1 +
>  lib/netdev-provider.h |  8 +
>  lib/netdev-vport.c|  3 +-
>  lib/netdev.c  |  9 ++
>  lib/netdev.h  |  1 +
>  9 files changed, 149 insertions(+), 10 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 3901129..58ac429 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -289,6 +289,8 @@ struct dp_netdev_rxq {
>  struct dp_netdev_pmd_thread *pmd;  /* pmd thread that will poll this queue. */
>  };
>  
> +#define LAST_USED_QID_NONE -1
> +
>  /* A port in a netdev-based datapath. */
>  struct dp_netdev_port {
>  odp_port_t port_no;
> @@ -303,6 +305,8 @@ struct dp_netdev_port {
>  char *type; /* Port type as requested by user. */
>  char *rxq_affinity_list;/* Requested affinity of rx queues. */
>  bool need_reconfigure;  /* True if we should reconfigure netdev. */
> +int last_used_qid;  /* Last queue id where packets could be
> +   enqueued. */
>  };
>  
>  /* Contained by struct dp_netdev_flow's 'stats' member.  */
> @@ -619,6 +623,9 @@ static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
>  static inline bool emc_entry_alive(struct emc_entry *ce);
>  static void emc_clear_entry(struct emc_entry *ce);
>  
> +static struct tx_port *pmd_send_port_cache_lookup
> +(const struct dp_netdev_pmd_thread *pmd, odp_port_t port_no);
> +
>  static void
>  emc_cache_init(struct emc_cache *flow_cache)
>  {
> @@ -3507,15 +3514,19 @@ pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
>  return i;
>  }
>  
> +enum { DRAIN_TSC = 2ULL };
> +
>  static void *
>  pmd_thread_main(void *f_)
>  {
>  struct dp_netdev_pmd_thread *pmd = f_;
> -unsigned int lc = 0;
> +unsigned int lc = 0, lc_drain = 0;
>  struct polled_queue *poll_list;
>  bool exiting;
>  int poll_cnt;
>  int i;
> +uint64_t prev = 0, now = 0;
> +struct tx_port *tx_port;
>  
>  poll_list = NULL;
>  
> @@ -3548,6 +3559,26 @@ reload:
> poll_list[i].port_no);
>  }
>  
> +#define MAX_LOOP_TO_DRAIN 128
> +if (lc_drain++ > MAX_LOOP_TO_DRAIN) {
> +lc_drain = 0;
> +prev = now;
> +now = pmd->last_cycles;
> +if ((now - prev) > DRAIN_TSC) {
> +HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {

'pmd->tx_ports' must be protected by 'pmd->port_mutex'. Also, it can be
changed while the pmd is still working. I think you wanted something like
'pmd->send_port_cache'.

> +if (tx_port->port->last_used_qid != LAST_USED_QID_NONE) {
> +/* This queue may contain some buffered packets waiting
> + * to be sent out. */
> +netdev_txq_drain(tx_port->port->netdev,
> +tx_port->port->last_used_qid,
> +tx_port->port->dynamic_txqs);
> +/* Mark it as empty. */
> +tx_port->port->last_used_qid = LAST_USED_QID_NONE;

'port' is a 

[ovs-dev] [PATCH] dpdk: Redirect DPDK log to OVS logging subsystem.

2017-03-02 Thread Ilya Maximets
This should be helpful for having all the logs in one place.
'ovs-appctl vlog' commands for the 'dpdk' module can be used
to configure the log level. The lower bound for DPDK logging
(--log-level) can still be passed through the 'dpdk-extra' field.
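For example, after this patch the usual vlog commands apply to the new
module (assuming the 'dpdk' module name registered by
VLOG_DEFINE_THIS_MODULE below):

$ ovs-appctl vlog/set dpdk:file:dbg
$ ovs-appctl vlog/list | grep dpdk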

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 NEWS   |  5 +
 lib/dpdk.c | 51 +++
 2 files changed, 56 insertions(+)

diff --git a/NEWS b/NEWS
index ce9fe88..8d4af9e 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,11 @@ Post-v2.7.0
`egress_pkt_mark` OVSDB option.
- EMC insertion probability is reduced to 1% and is configurable via
  the new 'other_config:emc-insert-inv-prob' option.
+   - DPDK:
+ * DPDK log messages redirected to OVS logging subsystem.
+   Log level can be changed in a usual OVS way using
+   'ovs-appctl vlog' commands for 'dpdk' module. Lower bound
+   still can be configured via extra arguments for DPDK EAL.
 
 v2.7.0 - xx xxx 
 -
diff --git a/lib/dpdk.c b/lib/dpdk.c
index c1626e2..eb03ec9 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -17,10 +17,12 @@
 #include <config.h>
 #include "dpdk.h"
 
+#include <errno.h>
 #include <stdio.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 
+#include <rte_log.h>
 #include <rte_memzone.h>
 #ifdef DPDK_PDUMP
 #include <rte_mempool.h>
@@ -36,6 +38,8 @@
 
 VLOG_DEFINE_THIS_MODULE(dpdk);
 
+static FILE *log_stream = NULL;   /* Stream for DPDK log redirection */
+
 static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
 
 static int
@@ -262,6 +266,45 @@ argv_release(char **dpdk_argv, char **dpdk_argv_release, size_t dpdk_argc)
 free(dpdk_argv);
 }
 
+static ssize_t
+dpdk_log_write(void *c OVS_UNUSED, const char *buf, size_t size)
+{
+char *str = xmalloc(size + 1);
+
+strncpy(str, buf, size);
+str[size] = '\0';
+
+switch (rte_log_cur_msg_loglevel()) {
+case RTE_LOG_DEBUG:
+VLOG_DBG("%s", str);
+break;
+case RTE_LOG_INFO:
+case RTE_LOG_NOTICE:
+VLOG_INFO("%s", str);
+break;
+case RTE_LOG_WARNING:
+VLOG_WARN("%s", str);
+break;
+case RTE_LOG_ERR:
+VLOG_ERR("%s", str);
+break;
+case RTE_LOG_CRIT:
+case RTE_LOG_ALERT:
+case RTE_LOG_EMERG:
+VLOG_EMER("%s", str);
+break;
+default:
+OVS_NOT_REACHED();
+}
+
+free(str);
+return size;
+}
+
+static cookie_io_functions_t dpdk_log_func = {
+.write = dpdk_log_write,
+};
+
 static void
 dpdk_init__(const struct smap *ovs_other_config)
 {
@@ -273,6 +316,14 @@ dpdk_init__(const struct smap *ovs_other_config)
 cpu_set_t cpuset;
 char *sock_dir_subcomponent;
 
+log_stream = fopencookie(NULL, "w+", dpdk_log_func);
+if (log_stream == NULL) {
+VLOG_ERR("Can't redirect DPDK log: %s.", ovs_strerror(errno));
+} else {
+setbuf(log_stream, NULL);
+rte_openlog_stream(log_stream);
+}
+
 if (process_vhost_flags("vhost-sock-dir", ovs_rundir(),
 NAME_MAX, ovs_other_config,
 &sock_dir_subcomponent)) {
-- 
2.7.4



[ovs-dev] [PATCH] dpdk: Fix abort on double free.

2016-11-28 Thread Ilya Maximets
According to DPDK API (lib/librte_eal/common/include/rte_eal.h):

"After the call to rte_eal_init(), all arguments argv[x]
 with x < ret may be modified and should not be accessed
 by the application."

This means that OVS must not free the arguments passed to DPDK.
In the real world, 'rte_eal_init()' replaces the last argument in
'dpdk_argv' with the first one by doing this:

# eal_parse_args() from lib/librte_eal/linuxapp/eal/eal.c

char *prgname = argv[0];
...
if (optind >= 0)
argv[optind-1] = prgname;

This leads to double free inside 'deferred_argv_release()' and
possible ABORT at exit:

*** Error in `ovs-vswitchd': double free or corruption (fasttop) <...> ***
Program received signal SIGABRT, Aborted.

#0  raise () from /lib64/libc.so.6
#1  abort () from /lib64/libc.so.6
#2  __libc_message () from /lib64/libc.so.6
#3  free () from /lib64/libc.so.6
#4  deferred_argv_release () at lib/dpdk.c:261
#5  __run_exit_handlers () from /lib64/libc.so.6
#6  exit () from /lib64/libc.so.6
#7  __libc_start_main () from /lib64/libc.so.6
#8  _start ()

Fix that by not calling free for the memory passed to DPDK.
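Schematically, the mechanics of the corruption (an illustration of the
above, not code from OVS or DPDK):

    /* argv = {"ovs-vswitchd", "-c", "0x1"}, argc = 3.
     * After rte_eal_init(): argv[optind - 1] = argv[0], i.e.
     * argv = {"ovs-vswitchd", "-c", "ovs-vswitchd"}.  Then: */
    for (i = 0; i < argc; i++) {
        free(argv[i]);    /* argv[0] is freed again via argv[2]. */
    }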

CC: Aaron Conole <acon...@redhat.com>
Fixes: bab694097133 ("netdev-dpdk: Convert initialization from cmdline to db")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpdk.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/lib/dpdk.c b/lib/dpdk.c
index 49a589a..2014946 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -254,17 +254,6 @@ static char **dpdk_argv;
 static int dpdk_argc;
 
 static void
-deferred_argv_release(void)
-{
-int result;
-for (result = 0; result < dpdk_argc; ++result) {
-free(dpdk_argv[result]);
-}
-
-free(dpdk_argv);
-}
-
-static void
 dpdk_init__(const struct smap *ovs_other_config)
 {
 char **argv = NULL;
@@ -384,8 +373,6 @@ dpdk_init__(const struct smap *ovs_other_config)
 dpdk_argv = argv;
 dpdk_argc = argc;
 
-atexit(deferred_argv_release);
-
 rte_memzone_dump(stdout);
 
 /* We are called from the main thread here */
-- 
2.7.4



Re: [ovs-dev] [PATCH] dpdk: Fix abort on double free.

2016-11-28 Thread Ilya Maximets
On 28.11.2016 21:55, Aaron Conole wrote:
> Ilya Maximets <i.maxim...@samsung.com> writes:
> 
>> According to DPDK API (lib/librte_eal/common/include/rte_eal.h):
>>
>>  "After the call to rte_eal_init(), all arguments argv[x]
>>   with x < ret may be modified and should not be accessed
>>   by the application."
>>
>> This means, that OVS must not free the arguments passed to DPDK.
>> In real world, 'rte_eal_init()' replaces the last argument in
>> 'dpdk_argv' with the first one by doing this:
> 
> Thanks for spotting this error, Ilya.
> 
>>  # eal_parse_args() from lib/librte_eal/linuxapp/eal/eal.c
>>
>>  char *prgname = argv[0];
>>  ...
>>  if (optind >= 0)
>>  argv[optind-1] = prgname;
>>
>> This leads to double free inside 'deferred_argv_release()' and
>> possible ABORT at exit:
> 
> I haven't seen this, which is both shocking and scary - the commit which
> does this copy is almost 4 years old;  did you have to do anything
> specific for this behavior to occur?  Did something change in DPDK
> recently that exposed this behavior?  Just wondering how you reproduced
> it.

The abort was caught accidentally. I'm able to reproduce it on my
slightly unusual testing system (ARMv8 + Fedora 21 + clang 3.5) without
any specific manipulations. The bug is always there, but it's hard
for libc to detect the double free here because there are many other
frees/allocations at exit time. I've used the following patch to confirm
the issue when it wasn't detected by libc:

diff --git a/lib/dpdk.c b/lib/dpdk.c
index 49a589a..65d2d28 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -258,6 +258,8 @@ deferred_argv_release(void)
 {
 int result;
 for (result = 0; result < dpdk_argc; ++result) {
+VLOG_INFO("DPDK ARGV release: %2d: 0x%" PRIx64 " (%s)",
+  result, (intptr_t)dpdk_argv[result], dpdk_argv[result]);
 free(dpdk_argv[result]);
 }
 


> 
>> *** Error in `ovs-vswitchd': double free or corruption (fasttop) <...> ***
>>  Program received signal SIGABRT, Aborted.
>>
>>  #0  raise () from /lib64/libc.so.6
>>  #1  abort () from /lib64/libc.so.6
>>  #2  __libc_message () from /lib64/libc.so.6
>>  #3  free () from /lib64/libc.so.6
>>  #4  deferred_argv_release () at lib/dpdk.c:261
>>  #5  __run_exit_handlers () from /lib64/libc.so.6
>>  #6  exit () from /lib64/libc.so.6
>>  #7  __libc_start_main () from /lib64/libc.so.6
>>  #8  _start ()
>>
>> Fix that by not calling free for the memory passed to DPDK.
>>
>> CC: Aaron Conole <acon...@redhat.com>
>> Fixes: bab694097133 ("netdev-dpdk: Convert initialization from cmdline to 
>> db")
>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
>> ---
> 
> We need to free the memory - I think that is not a question;

Actually, it is. According to the DPDK API (see above), 'rte_eal_init()'
takes ownership of 'argv'. This means that we must not free
or use this memory.

Some thoughts:
DPDK internally doesn't free this memory, but that's not a reason to
touch it from the outside. Actually, a DPDK API change would be required
here to support freeing these resources if needed. But as long as there
is no 'rte_eal_uninit()', such an API change isn't actually useful.

Also, I forgot to remove the variables, so the following incremental
on top of my original patch is required:


diff --git a/lib/dpdk.c b/lib/dpdk.c
index 2014946..4201149 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -250,9 +250,6 @@ get_dpdk_args(const struct smap *ovs_other_config, char 
***argv,
 return i + extra_argc;
 }
 
-static char **dpdk_argv;
-static int dpdk_argc;
-
 static void
 dpdk_init__(const struct smap *ovs_other_config)
 {
@@ -370,9 +367,6 @@ dpdk_init__(const struct smap *ovs_other_config)
 }
 }
 
-dpdk_argv = argv;
-dpdk_argc = argc;
-
 rte_memzone_dump(stdout);
 
 /* We are called from the main thread here */


Best regards, Ilya Maximets.


Re: [ovs-dev] [ovs-dev, 16/17] dpif-netdev: Centralized threads and queues handling code.

2016-11-24 Thread Ilya Maximets
Hi, Daniele.
This is not a complete review. I'll continue checking this.

Few high level comments:

*  Did you think about splitting 'reconfigure_datapath' apart?
   It's a very big function.

*  IMHO, 'reload_all_pmds' is a misleading name for a function
   which reloads only selected threads.

Also, few comments inline.

Best regards, Ilya Maximets.

On 16.11.2016 03:46, Daniele Di Proietto wrote:
> Currently we have three different code paths that deal with pmd threads
> and queues, in response to different input
> 
> 1. When a port is added
> 2. When a port is deleted
> 3. When the cpumask changes or a port must be reconfigured.
> 
> 1. and 2. are carefully written to minimize disruption to the running
> datapath, while 3. brings down all the threads reconfigure all the ports
> and restarts everything.
> 
> This commit removes the three separate code paths by introducing the
> reconfigure_datapath() function, that takes care of adapting the pmd
> threads and queues to the current datapath configuration, no matter how
> we got there.
> 
> This aims at simplifying maintenance and introduces a long overdue
> improvement: port reconfiguration (can happen quite frequently for
> dpdkvhost ports) is now done without shutting down the whole datapath,
> but just by temporarily removing the port that needs to be reconfigured
> (while the rest of the datapath is running).
> 
> We now also recompute the rxq scheduling from scratch every time a port
> is added or deleted.  This means that the queues will be more balanced,
> especially when dealing with explicit rxq-affinity from the user
> (without shutting down the threads and restarting them), but it also
> means that adding or deleting a port might cause existing queues to be
> moved between pmd threads.  This negative effect can be avoided by
> taking into account the existing distribution when computing the new
> scheduling, but I considered code clarity and fast reconfiguration more
> important than optimizing port addition or removal (a port is added and
> removed only once, but can be reconfigured many times)
> 
> Lastly, this commit moves the pmd threads state away from ovs-numa.  Now
> the pmd threads state is kept only in dpif-netdev.
> 
> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
> ---
>  lib/dpif-netdev.c | 899 
> +++---
>  tests/pmd.at  |   3 +-
>  2 files changed, 453 insertions(+), 449 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 0a88df3..93de684 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -289,6 +289,7 @@ struct dp_netdev_rxq {
>pinned. RXQ_CORE_UNPINNED if the
>queue doesn't need to be pinned to 
> a
>particular core. */
> +struct dp_netdev_pmd_thread *pmd;  /* pmd thread that will poll this 
> queue. */
>  };
>  
>  /* A port in a netdev-based datapath. */
> @@ -304,6 +305,7 @@ struct dp_netdev_port {
>  struct ovs_mutex txq_used_mutex;
>  char *type; /* Port type as requested by user. */
>  char *rxq_affinity_list;/* Requested affinity of rx queues. */
> +bool need_reconfigure;  /* True if we should reconfigure netdev. */
>  };
>  
>  /* Contained by struct dp_netdev_flow's 'stats' member.  */
> @@ -506,7 +508,7 @@ struct dp_netdev_pmd_thread {
>  
>  /* Queue id used by this pmd thread to send packets on all netdevs if
>   * XPS disabled for this netdev. All static_tx_qid's are unique and less
> - * than 'ovs_numa_get_n_cores() + 1'. */
> + * than 'cmap_count(dp->poll_threads)'. */
>  const int static_tx_qid;
>  
>  struct ovs_mutex port_mutex;/* Mutex for 'poll_list' and 'tx_ports'. 
> */
> @@ -536,6 +538,9 @@ struct dp_netdev_pmd_thread {
>   * reporting to the user */
>  unsigned long long stats_zero[DP_N_STATS];
>  uint64_t cycles_zero[PMD_N_CYCLES];
> +
> +/* Set to true if the pmd thread needs to be reloaded. */
> +bool need_reload;
>  };
>  
>  /* Interface to netdev-based datapath. */
> @@ -580,29 +585,26 @@ static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
>  static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
>  OVS_REQUIRES(dp->port_mutex);
>  
> +static void *pmd_thread_main(void *);
>  static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
>unsigned core_id);
>  static struct dp_netdev_pmd_thread *
>  dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
> -static v

Re: [ovs-dev] [ovs-dev, 03/17] dpif-netdev: Don't try to output on a device without txqs.

2016-11-21 Thread Ilya Maximets
On 16.11.2016 03:45, Daniele Di Proietto wrote:
> Tunnel devices have 0 txqs and don't support netdev_send().  While
> netdev_send() simply returns EOPNOTSUPP, the XPS logic is still executed
> on output, and that might be confused by devices with no txqs.
> 
> It seems better to have different structures in the fast path for ports
> that support netdev_{push,pop}_header (tunnel devices), and ports that
> support netdev_send.  With this we can also remove a branch in
> netdev_send().
> 
> This is also necessary for a future commit, which starts DPDK devices
> without txqs.
> 
> Signed-off-by: Daniele Di Proietto 
> ---
>  lib/dpif-netdev.c | 72 
> ---
>  lib/netdev.c  | 15 
>  lib/netdev.h  |  1 +
>  3 files changed, 64 insertions(+), 24 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 7b67b42..81366b2 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -422,7 +422,8 @@ struct rxq_poll {
>  struct ovs_list node;
>  };
>  
> -/* Contained by struct dp_netdev_pmd_thread's 'port_cache' or 'tx_ports'. */
> +/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
> + * 'tnl_port_cache' or 'tx_ports'. */
>  struct tx_port {
>  struct dp_netdev_port *port;
>  int qid;
> @@ -504,11 +505,19 @@ struct dp_netdev_pmd_thread {
>   * read by the pmd thread. */
>  struct hmap tx_ports OVS_GUARDED;
>  
> -/* Map of 'tx_port' used in the fast path. This is a thread-local copy of
> - * 'tx_ports'. The instance for cpu core NON_PMD_CORE_ID can be accessed
> - * by multiple threads, and thusly need to be protected by 
> 'non_pmd_mutex'.
> - * Every other instance will only be accessed by its own pmd thread. */
> -struct hmap port_cache;
> +/* These are thread-local copies of 'tx_ports'.  One contains only tunnel
> + * ports (that support push_tunnel/pop_tunnel)  The other contains
> + * non-tunnel ports (that support send).
> + *
> + * These are kept separate to make sure that we don't try to execute
> + * OUTPUT on a tunnel device (which has 0 txqs) or PUSH/POP on a 
> non-tunnel
> + * device.
> + *
> + * The instance for cpu core NON_PMD_CORE_ID can be accessed by multiple
> + * threads, and thusly needs to be protected by 'non_pmd_mutex'.  Every
> + * other instance will only be accessed by its own pmd thread. */
> +struct hmap tnl_port_cache;
> +struct hmap send_port_cache;
>  
>  /* Only a pmd thread can write on its own 'cycles' and 'stats'.
>   * The main thread keeps 'stats_zero' and 'cycles_zero' as base
> @@ -3055,7 +3064,10 @@ pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
>  /* Free all used tx queue ids. */
>  dpif_netdev_xps_revalidate_pmd(pmd, 0, true);
>  
> -HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->port_cache) {
> +HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
> +free(tx_port_cached);
> +}
> +HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
>  free(tx_port_cached);
>  }
>  }
> @@ -3069,11 +3081,22 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
>  struct tx_port *tx_port, *tx_port_cached;
>  
>  pmd_free_cached_ports(pmd);
> -hmap_shrink(&pmd->port_cache);
> +hmap_shrink(&pmd->send_port_cache);
> +hmap_shrink(&pmd->tnl_port_cache);
>  
>  HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
> +struct hmap *cache;
> +
> +if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
> +cache = &pmd->tnl_port_cache;
> +} else if (netdev_n_txq(tx_port->port->netdev)) {
> +cache = &pmd->send_port_cache;
> +} else {
> +continue;
> +}

IMHO, this code introduces artificial limitations for the netdev.
What about something like this:

if (has_push || has_pop) {
    insert into 'tnl_port_cache';
}

if (netdev_n_txq(tx_port->port->netdev)) {
    insert into 'send_port_cache';
}
?
i.e. make all the checks independent.

Otherwise, it must be described in 'netdev-provider.h' that a netdev
can have only tunnel-related functions (both 'push' and 'pop') or a
send function.

> +
>  tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
> -hmap_insert(>port_cache, _port_cached->node,
> +hmap_insert(cache, &tx_port_cached->node,
>  hash_port_no(tx_port_cached->port->port_no));
>  }
>  }
> @@ -3309,7 +3332,8 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>  pmd->next_optimization = time_msec() + DPCLS_OPTIMIZATION_INTERVAL;
>  ovs_list_init(&pmd->poll_list);
>  hmap_init(&pmd->tx_ports);
> -hmap_init(&pmd->port_cache);
> +hmap_init(&pmd->tnl_port_cache);
> +hmap_init(&pmd->send_port_cache);
>  /* init the 'flow_cache' since there is no
>   * actual thread created for NON_PMD_CORE_ID. */
>  if (core_id == 

[ovs-dev] [PATCH] netdev: Set the default number of queues at removal from the database

2016-12-08 Thread Ilya Maximets
Expected behavior for attribute removal from the database is
resetting it to the default value. Currently this doesn't work for
the n_rxq/n_txq options of pmd netdevs (the last requested value is
used):
# ovs-vsctl set interface dpdk0 options:n_rxq=4
# ovs-vsctl remove interface dpdk0 options n_rxq
# ovs-appctl dpif/show | grep dpdk0
  <...>
  dpdk0 1/1: (dpdk: configured_rx_queues=4, <...> \
requested_rx_queues=4,  <...>)

Fix that by using NR_QUEUE or 1 as the default value for 'smap_get_int'.
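For reference (a sketch of the semantics, not a quote from the patch):
'smap_get_int()' returns its third argument when the key is absent, so
using the previously requested value as that default makes the old
setting survive removal:

    /* Old: absent "n_rxq" yields dev->requested_n_rxq -> value sticks.
     * New: absent "n_rxq" yields NR_QUEUE -> value resets to default. */
    int n = smap_get_int(args, "n_rxq", NR_QUEUE);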

Fixes: a14b8947fd13 ("dpif-netdev: Allow different numbers of
  rx queues for different ports.")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/netdev-dpdk.c  | 2 +-
 lib/netdev-dummy.c | 4 ++--
 tests/pmd.at   | 7 +++
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 61d7aa3..625f425 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1084,7 +1084,7 @@ dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
 {
 int new_n_rxq;
 
-new_n_rxq = MAX(smap_get_int(args, "n_rxq", dev->requested_n_rxq), 1);
+new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
 if (new_n_rxq != dev->requested_n_rxq) {
 dev->requested_n_rxq = new_n_rxq;
 netdev_request_reconfigure(&dev->up);
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index dec1a8e..de74846 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -868,8 +868,8 @@ netdev_dummy_set_config(struct netdev *netdev_, const struct smap *args)
 goto exit;
 }
 
-new_n_rxq = MAX(smap_get_int(args, "n_rxq", netdev->requested_n_rxq), 1);
-new_n_txq = MAX(smap_get_int(args, "n_txq", netdev->requested_n_txq), 1);
+new_n_rxq = MAX(smap_get_int(args, "n_rxq", 1), 1);
+new_n_txq = MAX(smap_get_int(args, "n_txq", 1), 1);
 new_numa_id = smap_get_int(args, "numa_id", 0);
 if (new_n_rxq != netdev->requested_n_rxq
 || new_n_txq != netdev->requested_n_txq
diff --git a/tests/pmd.at b/tests/pmd.at
index 8f05d74..7d3fa0d 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -259,6 +259,13 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=42 
in_port=1 (via action) data_le
 
icmp,vlan_tci=0x,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0
 icmp_csum:f7ff
 ])
 
+dnl Check resetting to default number of rx queues after removal from the db.
+AT_CHECK([ovs-vsctl remove interface p1 options n_rxq])
+
+AT_CHECK([ovs-appctl dpif/show | grep p1 | sed 's/\(tx_queues=\)[[0-9]]*/\1<cleared>/g'], [0], [dnl
+   p1 1/1: (dummy-pmd: configured_rx_queues=1, configured_tx_queues=<cleared>, requested_rx_queues=1, requested_tx_queues=<cleared>)
+])
+
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
-- 
2.7.4



Re: [ovs-dev] [PATCH v3 00/18] DPDK/pmd reconfiguration refactor and bugfixes

2017-01-12 Thread Ilya Maximets
Hi, Daniele.
Thanks for v3.

Acked-by: Ilya Maximets <i.maxim...@samsung.com>

On 09.01.2017 06:14, Daniele Di Proietto wrote:
> The first two commits of the series are trivial bugfixes for dpif-netdev.
> 
> Then the series fixes a long standing bug that caused a crash when the
> admin link state of a port is changed while traffic is flowing.
> 
> The next part makes use of reconfiguration for port add: this makes
> the operation twice as fast and reduce some code duplication.  This part
> conflicts with the port naming change, so I'm willing to postpone it, unless
> we find it to be useful for the port naming change.
> 
> The rest of the series refactors a lot of code in dpif-netdev:
> 
> * We no longer start pmd threads on demand for each numa node.  This made
>   the code very complicated and introduced a lot of bugs.
> * The pmd threads state is now internal to dpif-netdev and it's not stored in
>   ovs-numa.
> * There's now a single function that handles pmd threads/ports changes: this
>   reduces code duplication and makes port reconfiguration faster, as we don't
>   have to bring down the whole datapath.
> 
> v3->v2:
> 
> * Rebased:
>   * Rebased against dpdk arbitrary name change.
>   * Dropped unsigned 'core_id' commit because a similar fix is already
> on master
> * Put space between *FOR_EACH* and (
> * Actually use new FOR_EACH_NUMA_ON_DUMP
> * Use hmap_contains() instead of dp_netdev_lookup_port() in a couple of
>   places
> * Restore spaces in log messages, lost while wrapping the string.
> 
> v1->v2:
> 
> * Postpone cls deletion in dp_netdev_destroy_pmd()
> * Allow ports to be in tnl_port_cache and send_port_cache at the same time
> * Set counter to 1025 when reloading pmd without queues to be polled
> * Rebased:
>   * Allow 0x in pmd-cpu-mask
>   * ...
> * Don't duplicate get_core_by_core_id() in get_cpu_core()
> * New commit for ovs-numa: don't use hmap_first_with_hash()
> * Keep per numa count of cores in ovs_numa_dump
> * Print queue id and port name in warning if there's no pmd thread
> * Extract pmd_remove_stale_ports() from reconfigure_datapath()
> * s/reload_all_pmds()/reload_affected_pmds()/
> * Declare variables at the beginning of the block in rxq_scheduling()
> * Use 'q' instead of 'port->rxqs[qid]' in a couple of places
> * Unref pmd in rxq_scheduling()
> * Simplify check for changed pmd threads
> * Properly reset queues to unassigned in reconfigure_datapath()
> * Optimize tx port insertion in pmd cache
> 
> 
> Daniele Di Proietto (18):
>   dpif-netdev: Fix memory leak.
>   dpif-netdev: Take non_pmd_mutex to access tx cached ports.
>   dpif-netdev: Don't try to output on a device without txqs.
>   netdev-dpdk: Don't call rte_dev_stop() in update_flags().
>   netdev-dpdk: Start also dpdkr devices only once on port-add.
>   netdev-dpdk: Refactor construct and destruct.
>   dpif-netdev: Use a boolean instead of pmd->port_seq.
>   dpif-netdev: Block pmd threads if there are no ports.
>   dpif-netdev: Create pmd threads for every numa node.
>   dpif-netdev: Make 'static_tx_qid' const.
>   dpctl: Avoid making assumptions on pmd threads.
>   ovs-numa: New ovs_numa_dump_contains_core() function.
>   ovs-numa: Add new dump types.
>   ovs-numa: Don't use hmap_first_with_hash().
>   ovs-numa: Add per numa and global counts in dump.
>   dpif-netdev: Use hmap for poll_list in pmd threads.
>   dpif-netdev: Centralized threads and queues handling code.
>   ovs-numa: Remove unused functions.
> 
>  lib/dpctl.c   |  107 +---
>  lib/dpif-netdev.c | 1427 
> -
>  lib/dpif.c|6 +-
>  lib/dpif.h|   12 +-
>  lib/netdev-dpdk.c |  170 +++
>  lib/netdev.c  |   41 +-
>  lib/netdev.h  |1 +
>  lib/ovs-numa.c|  284 +--
>  lib/ovs-numa.h|   35 +-
>  tests/pmd.at  |   49 +-
>  vswitchd/bridge.c |2 +
>  11 files changed, 1079 insertions(+), 1055 deletions(-)
> 


[ovs-dev] [PATCH v2] netdev-dpdk: Don't use dev->vhost_id without mutex.

2016-12-05 Thread Ilya Maximets
The copy should be used here.
Additionally, 'strlen' is changed to a faster check.

Fixes: 821b86649a90 ("netdev-dpdk: Don't try to unregister empty vhost_id.")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
Version 2:
* 'strlen' --> '[0]' (Suggested by Ben Pfaff)

 lib/netdev-dpdk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 6e5cd43..61d7aa3 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1027,7 +1027,7 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
 ovs_mutex_unlock(&dev->mutex);
 ovs_mutex_unlock(&dpdk_mutex);
 
-if (!strlen(dev->vhost_id)) {
+if (!vhost_id[0]) {
 goto out;
 }
 
-- 
2.7.4



[ovs-dev] [PATCH] netdev-dpdk: Don't use dev->vhost_id without mutex.

2016-12-05 Thread Ilya Maximets
> How about "if (!vhost_id[0])", to avoid a useless strlen call?

LGTM, I've posted v2 with that fix:
https://mail.openvswitch.org/pipermail/ovs-dev/2016-December/325807.html

Best regards, Ilya Maximets.


[ovs-dev] [PATCH] netdev-dpdk: Don't use dev->vhost_id without mutex.

2016-12-04 Thread Ilya Maximets
The copy should be used here.

Fixes: 821b86649a90 ("netdev-dpdk: Don't try to unregister empty vhost_id.")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/netdev-dpdk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 6e5cd43..e06aa28 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1027,7 +1027,7 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
 ovs_mutex_unlock(&dev->mutex);
 ovs_mutex_unlock(&dpdk_mutex);
 
-if (!strlen(dev->vhost_id)) {
+if (!strlen(vhost_id)) {
 goto out;
 }
 
-- 
2.7.4



[ovs-dev] [PATCH 2/3] dpif-netdev: Optimize txq assignment.

2016-12-02 Thread Ilya Maximets
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 8e9a623..8af2811 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3317,13 +3317,15 @@ reconfigure_datapath(struct dp_netdev *dp)
 }
 
 /* Add every port to the tx cache of every pmd thread, if it's not
- * there already. */
-HMAP_FOR_EACH (port, node, &dp->ports) {
-CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-ovs_mutex_lock(&pmd->port_mutex);
-dp_netdev_add_port_tx_to_pmd(pmd, port);
-ovs_mutex_unlock(&pmd->port_mutex);
+ * there already and if this pmd has at least one rxq to poll. */
+CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+ovs_mutex_lock(&pmd->port_mutex);
+if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
+HMAP_FOR_EACH (port, node, &dp->ports) {
+dp_netdev_add_port_tx_to_pmd(pmd, port);
+}
 }
+ovs_mutex_unlock(&pmd->port_mutex);
 }
 
 /* Reload affected pmd threads. */
-- 
2.7.4



[ovs-dev] [PATCH 0/3] Few more incrementals for reconfiguration refactoring

2016-12-02 Thread Ilya Maximets
Hi, Daniele.
I've prepared a few more changes to your patch-set
"[PATCH 00/17] DPDK/pmd reconfiguration refactor and bugfixes".

Some of these changes can be squashed into your patches and, maybe,
some of them should go separately. I'd like to hear your opinion
about them. They are prepared on top of my previous incremental +
your change for PMD/ports resetting. I hope it'll be easy to apply
them on top of your current version of the code.

Also, I'd like to see a v2 of the patch-set to be able to review it
or prepare new changes for it.

Best regards, Ilya Maximets.

Ilya Maximets (3):
  dpif-netdev: Use do_del_port in reconfigure_datapath.
  dpif-netdev: Optimize txq assignment.
  dpif-netdev: Avoid introducing of port->need_reconfigure.

 lib/dpif-netdev.c | 78 ---
 1 file changed, 40 insertions(+), 38 deletions(-)

-- 
2.7.4



Re: [ovs-dev] [PATCH v2 18/19] dpif-netdev: Centralized threads and queues handling code.

2016-12-06 Thread Ilya Maximets
On 03.12.2016 05:14, Daniele Di Proietto wrote:
> +/* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
> + * queues and marks the pmds as isolated.  Otherwise, assign non isolated
> + * pmds to unpinned queues.
> + *
> + * The function doesn't touch the pmd threads, it just stores the assignment
> + * in the 'pmd' member of each rxq. */
> +static void
> +rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
> +{
> +struct dp_netdev_port *port;
> +struct rr_numa_list rr;
> +
> +rr_numa_list_populate(dp, &rr);
> +
> +HMAP_FOR_EACH (port, node, &dp->ports) {
> +struct rr_numa *numa;
> +int numa_id;
> +
> +if (!netdev_is_pmd(port->netdev)) {
> +continue;
> +}
> +
> +numa_id = netdev_get_numa_id(port->netdev);
> +numa = rr_numa_list_lookup(&rr, numa_id);
> +
> +for (int qid = 0; qid < port->n_rxq; qid++) {
> +struct dp_netdev_rxq *q = &port->rxqs[qid];
> +
> +if (pinned && q->core_id != RXQ_CORE_UNPINNED) {
> +struct dp_netdev_pmd_thread *pmd;
> +
> +pmd = dp_netdev_get_pmd(dp, q->core_id);
> +if (!pmd) {
> +VLOG_WARN("There is no PMD thread on core %d. Queue "
> +  "%d on port \'%s\' will not be polled.",
> +  q->core_id, qid, 
> netdev_get_name(port->netdev));
> +} else {
> +q->pmd = pmd;
> +pmd->isolated = true;
> +dp_netdev_pmd_unref(pmd);
> +}
> +} else if (!pinned && q->core_id == RXQ_CORE_UNPINNED) {
> +if (!numa) {
> +VLOG_WARN("There's no available (non isolated) pmd 
> thread"
> +  "on numa node %d. Queue %d on port \'%s\' will"
> +  "not be polled.",

Spaces lost while splitting the string.

> +  numa_id, qid, netdev_get_name(port->netdev));
> +} else {
> +q->pmd = rr_numa_get_pmd(numa);
> +}
> +}
> +}
> +}
> +


[ovs-dev] [PATCH] tests/bundle: Use active_backup algorithm for up/down test.

2016-12-06 Thread Ilya Maximets
The HRW algorithm uses a hash function that depends on the build
environment. This leads to constant testsuite failures
with CFLAGS='-march=native' if the CPU supports hashing instructions:

[---]
 ./bundle.at:233: ovs-appctl ofproto/trace br0 \
 'in_port=LOCAL,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:06'
 stdout:
 Bridge: br0
 Flow: in_port=LOCAL,vlan_tci=0x0000,dl_src=50:54:00:00:00:05,\
   dl_dst=50:54:00:00:00:06,dl_type=0x0000

 Rule: table=0 cookie=0
 OpenFlow actions=bundle(eth_src,50,hrw,ofport,slaves:1,2)

 Final flow: unchanged
 Megaflow: recirc_id=0,in_port=LOCAL,dl_src=50:54:00:00:00:05, \
   dl_type=0x0000
 Datapath actions: 2
 ./bundle.at:234: tail -1 stdout
 --- -
 +++ /testsuite.dir/at-groups/85/stdout
 @@ -1,2 +1,2 @@
 -Datapath actions: 1
 +Datapath actions: 2

[---]

Using the 'active_backup' algorithm helps to avoid such issues.
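For reference, the build dependence comes from the selection rule itself.
A simplified sketch of highest-random-weight selection (not the exact OVS
code; 'members' and 'flow_hash' are illustrative):

    /* "hrw" picks the member with the highest hash score.  With
     * -march=native the hash may use CPU CRC32 instructions, so a
     * different (equally valid) member can win on different builds. */
    int best = -1;
    uint32_t best_hash = 0;

    for (int i = 0; i < n_members; i++) {
        uint32_t h = hash_2words(flow_hash, members[i]);
        if (best < 0 || h > best_hash) {
            best = i;
            best_hash = h;
        }
    }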

CC: Thadeu Lima de Souza Cascardo <casca...@redhat.com>
Fixes: 63460a30c53e ("tests/bundle: test bundle action with ports up and down")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 tests/bundle.at | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/bundle.at b/tests/bundle.at
index 11c9713..0b6a192 100644
--- a/tests/bundle.at
+++ b/tests/bundle.at
@@ -227,7 +227,7 @@ OVS_VSWITCHD_START([dnl
 add-port br0 p2 -- set Interface p2 type=dummy -- \
 set Interface p2 ofport_request=2
 ])
-AT_CHECK([ovs-ofctl add-flow br0 'actions=bundle(eth_src,50,hrw,ofport,slaves:1,2)'])
+AT_CHECK([ovs-ofctl add-flow br0 'actions=bundle(eth_src,50,active_backup,ofport,slaves:1,2)'])
 AT_CHECK([ovs-ofctl mod-port br0 p1 up])
 AT_CHECK([ovs-ofctl mod-port br0 p2 up])
 AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=LOCAL,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:06'], [0], [stdout])
-- 
2.7.4



Re: [ovs-dev] [PATCH] netdev-dpdk: Use intermediate queue during packet transmission.

2016-12-20 Thread Ilya Maximets
On 19.12.2016 21:05, Bodireddy, Bhanuprakash wrote:
> Thanks Ilya and Aaron for reviewing this patch and providing your comments, 
> my reply inline. 
> 
>> -Original Message-
>> From: Ilya Maximets [mailto:i.maxim...@samsung.com]
>> Sent: Monday, December 19, 2016 8:41 AM
>> To: Aaron Conole <acon...@redhat.com>; Bodireddy, Bhanuprakash
>> <bhanuprakash.bodire...@intel.com>
>> Cc: d...@openvswitch.org; Daniele Di Proietto <diproiet...@vmware.com>;
>> Thadeu Lima de Souza Cascardo <casca...@redhat.com>; Fischetti, Antonio
>> <antonio.fische...@intel.com>; Markus Magnusson
>> <markus.magnus...@ericsson.com>
>> Subject: Re: [ovs-dev] [PATCH] netdev-dpdk: Use intermediate queue during
>> packet transmission.
>>
>> Hi,
>>
>> I didn't test this patch yet. Bhanu, could you please describe your test 
>> scenario
>> and performance results in more details.
> 
> During the recent performance analysis improvements for classifier, we found 
> that bottleneck was also observed at flow batching.
> This was due to fewer packets in batch. To reproduce this, a simple P2P test 
> case can be used with 30 IXIA streams and matching IP flow rules.
> Eg:  For an IXIA stream with src Ip: 2.2.2.1, dst tip: 5.5.5.1 
> ovs-ofctl add-flow br0 dl_type=0x0800,nw_src=2.2.2.1,actions=output:2
> 
> For an IXIA stream with src Ip: 4.4.4.1, dst tip: 15.15.15.1
> ovs-ofctl add-flow br0 dl_type=0x0800,nw_src=4.4.4.1,actions=output:2
> 
> This leaves fewer packets in batches and packet_batch_per_flow_execute() 
> shall be invoked for every batch. 
> With 30 flows, I see the throughput drops to ~7.0 Mpps in PHY2PHY case for 64 
> byte udp packets. 
> 
>> It'll be nice if you provide throughput and latency measurement results for
>> different scenarios and packet sizes. Latency is important here.
> We are yet to do latency measurements in this case. With 30 IXIA streams 
> comprising of 64 byte udp packets there was
> an throughput improvement of 30% in P2P case and 13-15% in PVP case(single 
> queue). we will try to get the latency stats
> with and without this patch.
> 
>>
>> About the patch itself:
>>
>>  1. 'drain' called only for PMD threads. This will lead to
>> broken connection with non-PMD ports.
> I see that non-PMD ports are handled with vswitchd thread. 
> Tested PVP loopback case with tap ports and found to be working as expected. 
> Can you let me know the specific case
> you are referring here so that I can verify if the patch breaks it.

I meant something like this:


 *---HOST-1(br-int)-*   *-HOST-2(br-int)--*
 |  |   | |
 |   internal_port <--> dpdk0 <---> dpdk0 <--> internal_port  |
 |   192.168.0.1/24 |   |  192.168.0.2/24 |
 *--*   *-*

 (HOST-1)# ping 192.168.0.2

In this case I'm expecting that the first (NETDEV_MAX_BURST - 1) icmp packets
will get stuck in the TX queue. The next packet will cause sending of the
whole batch of NETDEV_MAX_BURST icmp packets. That is not good behaviour.
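In other words (an illustration assuming NETDEV_MAX_BURST == 32 and one
ping per second):

    /* pings 1..31 -> buffered in the tx queue, no replies;
     * ping 32     -> queue reaches NETDEV_MAX_BURST, all 32 sent at once;
     * i.e. up to ~31 seconds of artificial latency on a quiet port. */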

>>  2. 'xps_get_tx_qid()' called twice. First time on send and
>> the second time on drain. This may lead to different
>> returned 'tx_qid's and packets will stuck forever in
>> tx buffer.
> 
> You are right and xps_get_tx_qid() can return different tx_qids.
>  I was always testing for XPS disabled cases and completely overlooked this 
> case.
> This could be a potential problem for us and any suggestions should be very 
> helpful.
> 
>>
>>  3. 'txq_drain()' must take the 'tx_lock' for queue in case
>> of dynamic tx queues.
> 
> Agree, will handle this in next version. 
> 
>>
>>  4. Waiting for 1024 polling cycles of PMD thread may cause
>> a huge latency if we have few packets per second on one
>> port and intensive traffic on others.
> 
> I agree with you. We discussed this here and thought invoking the drain
> logic once every 1024 cycles is an optimal solution. But if the community 
> thinks otherwise
> we can move this in to the main 'for' loop so that it can be invoked more 
> often provided that
> 'DRAIN_TSC' cycles elapsed. 
> 
>>
>>  5. This patch breaks the counting of 'tx_dropped' packets
>> in netdev-dpdk.
> 
> I presume you are referring to below line in the code. 
> 
> - dropped += netdev_dpdk_eth_tx_burst() 
> + netdev_dpdk_eth_tx_queue()
> 
> Will handle this in v2 of this patch.
> 
>>
>>  6. Comments in netdev-provider.h should be fixed to reflect all the changes.

Re: [ovs-dev] [PATCH] netdev-dpdk: Use intermediate queue during packet transmission.

2016-12-19 Thread Ilya Maximets
Hi,

I didn't test this patch yet. Bhanu, could you please describe
your test scenario and performance results in more detail.
It would be nice if you provided throughput and latency measurement
results for different scenarios and packet sizes. Latency is
important here.

About the patch itself:

  1. 'drain' called only for PMD threads. This will lead to
     broken connections with non-PMD ports.

  2. 'xps_get_tx_qid()' is called twice: the first time on send and
     the second time on drain. This may return different 'tx_qid's,
     and packets will get stuck forever in the tx buffer (see the
     sketch after this list).

  3. 'txq_drain()' must take the 'tx_lock' for the queue in case
     of dynamic tx queues.

  4. Waiting for 1024 polling cycles of the PMD thread may cause
     huge latency if we have few packets per second on one
     port and intensive traffic on others.

  5. This patch breaks the counting of 'tx_dropped' packets
     in netdev-dpdk.

  6. Comments in netdev-provider.h should be fixed to reflect
     all the changes.

  7. At last, I agree with Aaron that explicitly allowing only
     'dpdk' ports is not good style. Also, mentioning the name of
     an exact netdev inside the common code is bad style too.
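To illustrate point 2 (a sketch of the failure mode; the drain-side call
is paraphrased from the patch):

    /* send:  qid = dpif_netdev_xps_get_tx_qid(pmd, tx, now);  -> e.g. 1
     *        ...packets are buffered in tx queue 1...
     * drain: qid = dpif_netdev_xps_get_tx_qid(pmd, tx, now);  -> e.g. 2
     *        netdev_txq_drain(netdev, qid);   -> drains queue 2 only.
     *
     * XPS may revalidate and hand out a different qid between the two
     * calls, so queue 1 is never drained and its packets are stuck. */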

Best regards, Ilya Maximets.

On 16.12.2016 22:24, Aaron Conole wrote:
> Hi Bhanu,
> 
> Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com> writes:
> 
>> In exact match cache processing on an EMC hit, packets are queued in to
>> batches matching the flow. Thereafter, packets are processed in batches
>> for faster packet processing. This particularly is inefficient if there
>> are fewer packets in a batch as rte_eth_tx_burst() incurs expensive MMIO
>> write.
>>
>> This commit adds back intermediate queue implementation. Packets are
>> queued and burst when the packet count >= NETDEV_MAX_BURST. Also drain
>> logic is refactored to handle fewer packets in the tx queues. Testing
>> shows significant performance gains with queueing.
>>
>> Fixes: b59cc14e032d("netdev-dpdk: Use instant sending instead of
>> queueing of packets.")
>> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com>
>> Signed-off-by: Antonio Fischetti <antonio.fische...@intel.com>
>> Co-authored-by: Antonio Fischetti <antonio.fische...@intel.com>
>> Signed-off-by: Markus Magnusson <markus.magnus...@ericsson.com>
>> Co-authored-by: Markus Magnusson <markus.magnus...@ericsson.com>
>> ---
> 
> I've Cc'd Ilya just in hopes that the patch gets a better review than I
> could give.  As a general comment, I like the direction - batched
> operations are usually a better way of going.
> 
> Just minor below.
> 
> ... snip ...
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 3509493..65dff83 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -622,6 +622,9 @@ static int dpif_netdev_xps_get_tx_qid(const struct 
>> dp_netdev_pmd_thread *pmd,
>>  static inline bool emc_entry_alive(struct emc_entry *ce);
>>  static void emc_clear_entry(struct emc_entry *ce);
>>  
>> +static struct tx_port *pmd_send_port_cache_lookup
>> +(const struct dp_netdev_pmd_thread *pmd, odp_port_t port_no);
>> +
>>  static void
>>  emc_cache_init(struct emc_cache *flow_cache)
>>  {
>> @@ -2877,6 +2880,31 @@ cycles_count_end(struct dp_netdev_pmd_thread *pmd,
>>  }
>>  
>>  static void
>> +dp_netdev_drain_txq_port(struct dp_netdev_pmd_thread *pmd,
>> + struct dp_netdev_port *port,
>> + uint64_t now)
>> +{
>> +int tx_qid;
>> +
>> +if (!strcmp(port->type, "dpdk")) {
> 
> Any reason to restrict this only to dpdk ports?  It looks like you've
> added a new netdev operation, so why not just call the netdev_txq_drain
> unconditionally?
> 
> Also, bit of a nit, but tq_qid can be reduced in scope down to the if
> block below.
> 
>> +struct tx_port *tx = pmd_send_port_cache_lookup(pmd,
>> + u32_to_odp(port->port_no));
>> +
>> +if (OVS_LIKELY(tx)) {
>> +bool dynamic_txqs = tx->port->dynamic_txqs;
>> +
>> +if (dynamic_txqs) {
>> +tx_qid = dpif_netdev_xps_get_tx_qid(pmd, tx, now);
>> +} else {
>> +tx_qid = pmd->static_tx_qid;
>> +}
>> +
>> +netdev_txq_drain(port->netdev, tx_qid);
>> +}
>> +}
>> +}
>> +
> 
> 
> 


Re: [ovs-dev] [PATCH] netdev-dpdk: Use intermediate queue during packet transmission.

2016-12-20 Thread Ilya Maximets
On 20.12.2016 15:19, Bodireddy, Bhanuprakash wrote:
>> -Original Message-
>> From: Ilya Maximets [mailto:i.maxim...@samsung.com]
>> Sent: Tuesday, December 20, 2016 8:09 AM
>> To: Bodireddy, Bhanuprakash <bhanuprakash.bodire...@intel.com>; Aaron
>> Conole <acon...@redhat.com>
>> Cc: d...@openvswitch.org; Daniele Di Proietto <diproiet...@vmware.com>;
>> Thadeu Lima de Souza Cascardo <casca...@redhat.com>; Fischetti, Antonio
>> <antonio.fische...@intel.com>; Markus Magnusson
>> <markus.magnus...@ericsson.com>
>> Subject: Re: [ovs-dev] [PATCH] netdev-dpdk: Use intermediate queue during
>> packet transmission.
>>
>> On 19.12.2016 21:05, Bodireddy, Bhanuprakash wrote:
>>> Thanks Ilya and Aaron for reviewing this patch and providing your
>> comments, my reply inline.
>>>
>>>> -Original Message-
>>>> From: Ilya Maximets [mailto:i.maxim...@samsung.com]
>>>> Sent: Monday, December 19, 2016 8:41 AM
>>>> To: Aaron Conole <acon...@redhat.com>; Bodireddy, Bhanuprakash
>>>> <bhanuprakash.bodire...@intel.com>
>>>> Cc: d...@openvswitch.org; Daniele Di Proietto
>>>> <diproiet...@vmware.com>; Thadeu Lima de Souza Cascardo
>>>> <casca...@redhat.com>; Fischetti, Antonio
>>>> <antonio.fische...@intel.com>; Markus Magnusson
>>>> <markus.magnus...@ericsson.com>
>>>> Subject: Re: [ovs-dev] [PATCH] netdev-dpdk: Use intermediate queue
>>>> during packet transmission.
>>>>
>>>> Hi,
>>>>
>>>> I didn't test this patch yet. Bhanu, could you please describe your
>>>> test scenario and performance results in more details.
>>>
>>> During the recent performance analysis improvements for classifier, we
>> found that bottleneck was also observed at flow batching.
>>> This was due to fewer packets in batch. To reproduce this, a simple P2P test
>> case can be used with 30 IXIA streams and matching IP flow rules.
>>> E.g., for an IXIA stream with src IP 2.2.2.1, dst IP 5.5.5.1:
>>> ovs-ofctl add-flow br0
>>> dl_type=0x0800,nw_src=2.2.2.1,actions=output:2
>>>
>>> For an IXIA stream with src IP 4.4.4.1, dst IP 15.15.15.1:
>>> ovs-ofctl add-flow br0
>>> dl_type=0x0800,nw_src=4.4.4.1,actions=output:2
>>>
>>> This leaves fewer packets in each batch, and packet_batch_per_flow_execute()
>>> will be invoked for every batch.
>>> With 30 flows, I see the throughput drop to ~7.0 Mpps in the PHY2PHY case for
>>> 64-byte UDP packets.
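>>>
>>> To make the effect concrete, here is a standalone toy model (plain C,
>>> not OVS code) of how one full RX burst spread over 30 flows degenerates
>>> into ~30 one- or two-packet sends:
>>>
>>>     #include <stdio.h>
>>>
>>>     #define BURST 32   /* NETDEV_MAX_BURST */
>>>     #define FLOWS 30   /* matching IXIA streams */
>>>
>>>     int main(void)
>>>     {
>>>         int batch_size[FLOWS] = {0};
>>>         int pkt, f;
>>>
>>>         /* Packets of one burst are batched per matched flow. */
>>>         for (pkt = 0; pkt < BURST; pkt++) {
>>>             batch_size[pkt % FLOWS]++;
>>>         }
>>>         /* Each per-flow batch triggers its own send of 1-2 packets. */
>>>         for (f = 0; f < FLOWS; f++) {
>>>             printf("flow %2d: send of %d packet(s)\n", f, batch_size[f]);
>>>         }
>>>         return 0;
>>>     }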
>>>
>>>> It'll be nice if you provide throughput and latency measurement
>>>> results for different scenarios and packet sizes. Latency is important 
>>>> here.
>>> We are yet to do latency measurements in this case. With 30 IXIA
>>> streams comprising 64-byte UDP packets there was a throughput
>>> improvement of 30% in the P2P case and 13-15% in the PVP case (single
>>> queue). We will try to get the latency stats with and without this patch.
>>>
>>>>
>>>> About the patch itself:
>>>>
>>>>  1. 'drain' is called only for PMD threads. This will lead to
>>>> broken connections with non-PMD ports.
>>> I see that non-PMD ports are handled by the vswitchd thread.
>>> Tested the PVP loopback case with tap ports and found it to be working as
>>> expected. Can you let me know the specific case you are referring to here so
>>> that I can verify whether the patch breaks it.
>>
>> I meant something like this:
>>
>>
>> *---HOST-1(br-int)--------*     *--------HOST-2(br-int)---*
>> |                         |     |                         |
>> | internal_port <--> dpdk0 <---> dpdk0 <--> internal_port |
>> | 192.168.0.1/24          |     |          192.168.0.2/24 |
>> *-------------------------*     *-------------------------*
>>
>> (HOST-1)# ping 192.168.0.2
>>
>> In this case I'm expecting that the first (NETDEV_MAX_BURST - 1) ICMP packets
>> will be stuck in the TX queue. The next packet will cause sending of the whole
>> batch of NETDEV_MAX_BURST ICMP packets. That is not good behaviour.
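>>
>> What I would expect here is a guard that also flushes a partially filled
>> queue after a deadline, roughly like this sketch (the 'first_enqueue_time'
>> field, the 'MAX_LATENCY_US' constant and the 'flush_txq()' helper are
>> hypothetical, not taken from the patch):
>>
>>     /* Sketch only: send even a partial batch once it is too old. */
>>     if (txq->count >= NETDEV_MAX_BURST
>>         || (txq->count
>>             && now - txq->first_enqueue_time >= MAX_LATENCY_US)) {
>>         flush_txq(dev, qid);   /* Hand the queued packets to the NIC. */
>>     }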
> 
> Thanks for the test case. I tested this and found it to be working with the
> patch. The reason being, PMD threads use the intermediate queue implementation
> by invoking 'netdev_dpdk_eth_tx_queue()', 
> Wher

[ovs-dev] sphinx on os x on travis?

2017-03-20 Thread Ilya Maximets
Hi Ben and Lance,

> Based on the above, I think the issue is that sphinx 1.1 isn't actually
> sufficient to build the documentation.

That is not true.

I did some investigation and found that the root cause of this issue
is the version of the python-docutils library.

The described issue is a known bug of the python-docutils library that was
introduced in version 0.8 and fixed in 0.9 [1].
This library is used by sphinx for rST parsing. 

Bug in their bugtracker: https://sourceforge.net/p/docutils/bugs/182/

Travis uses Ubuntu 12.04 LTS for testing, which contains [2] a buggy
python-docutils (0.8.1-4ubuntu1). It's too old.

Possible solutions to make Travis happy:

* Update python-docutils to at least version 0.9

* Use a workaround for the buggy docutils, like the one
  suggested in the bugtracker:

Always use a blank line before the content, i.e.
"""
.. note::

Some text
"""
instead of
"""
.. note::
Some text
"""

Looks like there were no documentation builds at all before
the lowering of the minimal sphinx version, because Ubuntu 12.04
contains sphinx-1.1.3.

My RHEL7 installation has python-sphinx-1.1.3 and python-docutils-0.11
with no build issues.

[1] http://docutils.sourceforge.net/HISTORY.html
[2] http://packages.ubuntu.com/precise/python-docutils

Best regards, Ilya Maximets.


[ovs-dev] [PATCH 3/3] netdev-dpdk: Use uint8_t for port_id.

2017-04-03 Thread Ilya Maximets
Currently, a signed integer is used for the 'port_id' variable, with
'-1' as the identifier of a bad or uninitialized 'port_id'.

This is inconsistent with the DPDK library and, in a few cases, also
leads to passing '-1' to DPDK functions where uint8_t is expected.

Such behaviour doesn't produce any issues, but it's better to
use the same type as in the DPDK library for consistency.

Also, the magic number '-1' is replaced with the DPDK_ETH_PORT_ID_INVALID
macro.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/netdev-dpdk.c | 61 +++
 1 file changed, 35 insertions(+), 26 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 658a454..216ced8 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -140,6 +140,8 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
 #define OVS_VHOST_QUEUE_DISABLED    (-2) /* Queue was disabled by guest and not
                                           * yet mapped to another queue. */
 
+#define DPDK_ETH_PORT_ID_INVALID    RTE_MAX_ETHPORTS
+
 #define VHOST_ENQ_RETRY_NUM 8
 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
 
@@ -309,7 +311,7 @@ struct dpdk_ring {
 struct rte_ring *cring_tx;
 struct rte_ring *cring_rx;
 unsigned int user_port_id; /* User given port no, parsed from port name */
-int eth_port_id; /* ethernet device port id */
+uint8_t eth_port_id; /* ethernet device port id */
 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 };
 
@@ -325,7 +327,7 @@ enum dpdk_hw_ol_features {
 
 struct netdev_dpdk {
 struct netdev up;
-int port_id;
+uint8_t port_id;
 int max_packet_len;
 enum dpdk_dev_type type;
 
@@ -402,7 +404,7 @@ struct netdev_dpdk {
 
 struct netdev_rxq_dpdk {
 struct netdev_rxq up;
-int port_id;
+uint8_t port_id;
 };
 
 static int netdev_dpdk_class_init(void);
@@ -602,12 +604,12 @@ check_link_status(struct netdev_dpdk *dev)
 dev->link_reset_cnt++;
 dev->link = link;
 if (dev->link.link_status) {
-VLOG_DBG_RL(&rl, "Port %d Link Up - speed %u Mbps - %s",
+VLOG_DBG_RL(&rl, "Port %"PRIu8" Link Up - speed %u Mbps - %s",
 dev->port_id, (unsigned) dev->link.link_speed,
 (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
  ("full-duplex") : ("half-duplex"));
 } else {
-VLOG_DBG_RL(&rl, "Port %d Link Down", dev->port_id);
+VLOG_DBG_RL(&rl, "Port %"PRIu8" Link Down", dev->port_id);
 }
 }
 }
@@ -725,8 +727,8 @@ dpdk_eth_checksum_offload_configure(struct netdev_dpdk *dev)
 if (rx_csum_ol_flag &&
 (info.rx_offload_capa & rx_chksm_offload_capa) !=
  rx_chksm_offload_capa) {
-VLOG_WARN_ONCE("Rx checksum offload is not supported on device %d",
-   dev->port_id);
+VLOG_WARN_ONCE("Rx checksum offload is not supported on device %"PRIu8,
+   dev->port_id);
 dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
 return;
 }
@@ -737,7 +739,8 @@ static void
 dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
 {
 if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
-VLOG_WARN("Failed to enable flow control on device %d", dev->port_id);
+VLOG_WARN("Failed to enable flow control on device %"PRIu8,
+  dev->port_id);
 }
 }
 
@@ -775,7 +778,7 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
 
 memset(&eth_addr, 0x0, sizeof(eth_addr));
 rte_eth_macaddr_get(dev->port_id, &eth_addr);
-VLOG_INFO_RL(&rl, "Port %d: "ETH_ADDR_FMT,
+VLOG_INFO_RL(&rl, "Port %"PRIu8": "ETH_ADDR_FMT,
 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));
 
 memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
@@ -787,7 +790,7 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
 /* Get the Flow control configuration for DPDK-ETH */
 diag = rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
 if (diag) {
-VLOG_DBG("cannot get flow control parameters on port=%d, err=%d",
+VLOG_DBG("cannot get flow control parameters on port=%"PRIu8", err=%d",
  dev->port_id, diag);
 }
 
@@ -832,7 +835,7 @@ netdev_dpdk_alloc_txq(unsigned int n_txqs)
 }
 
 static int
-common_construct(struct netdev *netdev, unsigned int port_no,
+common_construct(struct netdev *netdev, uint8_t port_no,
  enum dpdk_dev_type type, int socket_id)
 OVS_REQUIRES(dpdk_mutex)
 {
@@ -917,7 +920,8 @@ vhost_common_construct(struct netdev *netdev)
 return ENOMEM;
 }
 
-return common_construct(netdev, -1, DPDK_DEV_VHOST, socket_id);
+return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
+DPDK_DEV_VHOST, socket_id);

[ovs-dev] [PATCH 0/3] Hotplug fixes & port_id refactoring

2017-04-03 Thread Ilya Maximets

Ilya Maximets (3):
  netdev-dpdk: Fix double attaching of virtual devices.
  netdev-dpdk: Fix device leak on port deletion.
  netdev-dpdk: Use uint8_t for port_id.

 Documentation/howto/dpdk.rst |   5 +-
 lib/netdev-dpdk.c| 137 +++
 2 files changed, 62 insertions(+), 80 deletions(-)

-- 
2.7.4



[ovs-dev] [PATCH 1/3] netdev-dpdk: Fix double attaching of virtual devices.

2017-04-03 Thread Ilya Maximets
'devargs' for virtual devices contains not only the name but
also a list of arguments, like this:

'net_pcap0,rx_pcap=file_rx.pcap,tx_pcap=file_tx.pcap'
or
'eth_af_packet0,iface=eth0'

We must cut off the arguments from this string before calling
'rte_eth_dev_get_port_by_name()' to avoid double attaching of
the same device.

CC: Ciara Loftus <ciara.lof...@intel.com>
Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/netdev-dpdk.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index ddc651b..c8d7108 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1114,9 +1114,16 @@ static int
 netdev_dpdk_process_devargs(const char *devargs, char **errp)
 {
 uint8_t new_port_id = UINT8_MAX;
+char *ind, *name = xstrdup(devargs);
+
+/* Get the name from the comma separated list of arguments. */
+ind = index(name, ',');
+if (ind != NULL) {
+*ind = '\0';
+}
 
 if (!rte_eth_dev_count()
-|| rte_eth_dev_get_port_by_name(devargs, &new_port_id)
+|| rte_eth_dev_get_port_by_name(name, &new_port_id)
 || !rte_eth_dev_is_valid_port(new_port_id)) {
 /* Device not found in DPDK, attempt to attach it */
 if (!rte_eth_dev_attach(devargs, &new_port_id)) {
@@ -1129,6 +1136,7 @@ netdev_dpdk_process_devargs(const char *devargs, char **errp)
 }
 }
 
+free(name);
 return new_port_id;
 }
 
-- 
2.7.4



[ovs-dev] [PATCH 2/3] netdev-dpdk: Fix device leak on port deletion.

2017-04-03 Thread Ilya Maximets
Currently, a device created in DPDK will exist forever,
even after the del-port operation, until we manually call
'ovs-appctl netdev-dpdk/detach <port>', where <port> is not
the port's name but the name of the DPDK eth device or its PCI address.

A few issues with the current implementation:

1. Different API for usual (system) and DPDK devices.
   (We have to call 'ovs-appctl netdev-dpdk/detach' each
time after 'del-port' to actually free the device)
   This is a big issue mostly for virtual DPDK devices.

2. Follows from 1:
   For DPDK devices 'del-port' leads just to
   'rte_eth_dev_stop', and a subsequent 'add-port' will
   just start the already existing device. Such behaviour
   will not reset the device to its initial state as might
   be expected. For example: a virtual pcap PMD will continue
   reading the input file instead of reading it from the beginning.

3. Follows from 2:
   After execution of the following commands 'port1' will be
   configured with the 'old-options' while 'ovs-vsctl show'
   will show us 'new-options' in dpdk-devargs field:

 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \
   options:dpdk-devargs=<name>,<old-options>
 ovs-vsctl del-port port1
 ovs-vsctl add-port port1 -- set interface port1 type=dpdk \
   options:dpdk-devargs=<name>,<new-options>

4. Follows from 1:
   A device that was not detached still consumes a 'port_id'. Since we
   have a very limited number of 'port_id's (32 in the common case), this
   may lead to quick exhaustion of the id pool and the inability to add
   any other port.

To avoid the above issues we need to detach all the attached devices on
port destruction.
The 'netdev-dpdk/detach' appctl command is removed because it is not
needed anymore.

CC: Ciara Loftus <ciara.lof...@intel.com>
Fixes: 55e075e65ef9 ("netdev-dpdk: Arbitrary 'dpdk' port naming")
Fixes: 69876ed78611 ("netdev-dpdk: Add support for virtual DPDK PMDs (vdevs)")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 Documentation/howto/dpdk.rst |  5 ++--
 lib/netdev-dpdk.c| 66 +++-
 2 files changed, 18 insertions(+), 53 deletions(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index dc63f7d..20d8975 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -342,10 +342,9 @@ Then it can be attached to OVS::
 $ ovs-vsctl add-port br0 dpdkx -- set Interface dpdkx type=dpdk \
 options:dpdk-devargs=0000:01:00.0
 
-It is also possible to detach a port from ovs, the user has to remove the
-port using the del-port command, then it can be detached using::
+Detaching will be performed while processing del-port command::
 
-$ ovs-appctl netdev-dpdk/detach 0000:01:00.0
+$ ovs-vsctl del-port dpdkx
 
 This feature is not supported with VFIO and does not work with some NICs.
 For more information please refer to the `DPDK Port Hotplug Framework
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index c8d7108..658a454 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -359,6 +359,9 @@ struct netdev_dpdk {
 /* Device arguments for dpdk ports */
 char *devargs;
 
+/* If true, device was attached by rte_eth_dev_attach(). */
+bool attached;
+
 /* In dpdk_list. */
 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 
@@ -851,6 +854,7 @@ common_construct(struct netdev *netdev, unsigned int port_no,
 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
 ovsrcu_index_init(&dev->vid, -1);
 dev->vhost_reconfigured = false;
+dev->attached = false;
 
 ovsrcu_init(&dev->qos_conf, NULL);
 
@@ -996,10 +1000,21 @@ static void
 netdev_dpdk_destruct(struct netdev *netdev)
 {
 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+char devname[RTE_ETH_NAME_MAX_LEN];
 
 ovs_mutex_lock(&dpdk_mutex);
 
 rte_eth_dev_stop(dev->port_id);
+
+if (dev->attached) {
+rte_eth_dev_close(dev->port_id);
+if (rte_eth_dev_detach(dev->port_id, devname) < 0) {
+VLOG_ERR("Device '%s' can not be detached", dev->devargs);
+} else {
+VLOG_INFO("Device '%s' detached", devname);
+}
+}
+
 free(dev->devargs);
 common_destruct(dev);
 
@@ -1128,6 +1143,7 @@ netdev_dpdk_process_devargs(const char *devargs, char **errp)
 /* Device not found in DPDK, attempt to attach it */
 if (!rte_eth_dev_attach(devargs, &new_port_id)) {
 /* Attach successful */
+dev->attached = true;
 VLOG_INFO("Device '%s' attached to DPDK", devargs);
 } else {
 /* Attach unsuccessful */
@@ -2436,53 +2452,6 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
 unixctl_command_reply(conn, "OK");
 }
 
-static voi

[ovs-dev] Traffic fails in vhost user port

2017-04-04 Thread Ilya Maximets
Hi Sundar.

> Hi,
> I have an OVS bridge br0 with no NICs and  1 vhost user port which is 
> connected to a VM. But ping fails between the VM and the br0 port, either 
> way. The stats show zero all the time. Inside the VM, tcpdump shows nothing.
> 
> This is with OVS 2.7.0 and DPDK 17.02. Please indicate what could be going 
> wrong.
> 
> In the host, the bridge's internal port is up.
> # ip addr show br0
> 24: br0:  mtu 1500 qdisc noqueue state UNKNOWN 
> qlen 500
> link/ether 52:2f:57:13:d8:40 brd ff:ff:ff:ff:ff:ff
> inet 200.1.1.1/24 scope global br0
>valid_lft forever preferred_lft forever
> 
> In the VM, the eth interface is up with address 200.1.1.2/24.
> 
> The ports are in the following state, even after "ovs-ofctl mod-port br0 vi1 
> up":
> # ovs-ofctl dump-ports-desc br0
> OFPST_PORT_DESC reply (xid=0x2):
> 5(vi1): addr:00:00:00:00:00:00
>  config: 0
>  state:  0
>  speed: 0 Mbps now, 0 Mbps max
> LOCAL(br0): addr:52:2f:57:13:d8:40
>  config: 0
>  state:  0
>  current:10MB-FD COPPER
>  speed: 10 Mbps now, 0 Mbps max
> 
> The flows are configured as below:
> # ovs-ofctl dump-flows br0
> NXST_FLOW reply (xid=0x4):
> cookie=0x0, duration=2833.612s, table=0, n_packets=0, n_bytes=0, 
> idle_age=2833, in_port=1 actions=output:5
> cookie=0x2, duration=2819.820s, table=0, n_packets=0, n_bytes=0, 
> idle_age=2819, in_port=5 actions=output:1

I guess your flow table is configured in a wrong way.
The OpenFlow port of br0 is LOCAL, not 1.
Try this:

# ovs-ofctl del-flows br0

# ovs-ofctl add-flow br0 in_port=5,actions=output:LOCAL
# ovs-ofctl add-flow br0 in_port=LOCAL,actions=output:5

or

# ovs-ofctl add-flow br0 actions=NORMAL

> 
> The table info is as below:
> # ovs-ofctl dump-tables br0 | more
> OFPST_TABLE reply (xid=0x2):
>   table 0 ("classifier"):
> active=2, lookup=37, matched=28
> max_entries=100
> matching:
>   in_port: exact match or wildcard
>   eth_src: exact match or wildcard
>   eth_dst: exact match or wildcard
>   eth_type: exact match or wildcard
>   vlan_vid: exact match or wildcard
>   vlan_pcp: exact match or wildcard
>   ip_src: exact match or wildcard
>   ip_dst: exact match or wildcard
>   nw_proto: exact match or wildcard
>   nw_tos: exact match or wildcard
>   tcp_src: exact match or wildcard
>   tcp_dst: exact match or wildcard
> 
>   table 1 ("table1"):
>active=0, lookup=0, matched=0
> (same features)
> 
> Thanks,
> Sundar


Re: [ovs-dev] Traffic fails in vhost user port

2017-04-04 Thread Ilya Maximets
On 04.04.2017 12:26, Nadathur, Sundar wrote:
> Thanks, Ilya. 
> 
> # ovs-vsctl list Interface vi1
> _uuid   : 30d1600a-ff7d-4bf5-9fdb-b0767af3611c
> admin_state : up
> bfd : {}
> bfd_status  : {}
> cfm_fault   : []
> cfm_fault_status: []
> cfm_flap_count  : []
> cfm_health  : []
> cfm_mpid: []
> cfm_remote_mpids: []
> cfm_remote_opstate  : []
> duplex  : []
> error   : []
> external_ids: {}
> ifindex : 0
> ingress_policing_burst: 0
> ingress_policing_rate: 0
> lacp_current: []
> link_resets : 0
> link_speed  : []
> link_state  : up
> lldp: {}
> mac : []
> mac_in_use  : "00:00:00:00:00:00"
> mtu : 1500
> mtu_request : []
> name: "vi1"
> ofport  : 5
> ofport_request  : []
> options : {}
> other_config: {}
> statistics  : {"rx_1024_to_1518_packets"=0, 
> "rx_128_to_255_packets"=0, "rx_1523_to_max_packets"=0, 
> "rx_1_to_64_packets"=0, "rx_256_to_511_packets"=0, 
> "rx_512_to_1023_packets"=0, "rx_65_to_127_packets"=0, rx_bytes=0, 
> rx_dropped=0, rx_errors=0, tx_bytes=0, tx_dropped=11}
> status  : {}
> type: dpdkvhostuser
> 
> Here is the qemu command line split for readability:
> /usr/libexec/qemu-kvm -name guest=vhu-vm1,debug-threads=on -S 
>-object 
> secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-3-vhu-vm1/master-key.aes
>  -machine pc-i440fx-rhel7.3.0,accel=kvm,usb=off 
>-m 2048 -mem-prealloc -mem-path /dev/hugepages/libvirt/qemu -realtime 
> mlock=off -smp 2,sockets=2,cores=1,threads=1 
>-uuid f5b8c05b-9c7a-3211-49b9-2bd635f7e2aa -no-user-config -nodefaults 
>-chardev 
> socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-3-vhu-vm1/monitor.sock,server,nowait
>-mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc 
> -no-shutdown -boot strict=on -device 
> piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2
>-drive 
> file=/home/nfv/Images/vm1.qcow2,format=qcow2,if=none,id=drive-virtio-disk0 
> -device 
> virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
>  
> -chardev socket,id=charnet0,path=/usr/local/var/run/openvswitch/vi1 
> -netdev vhost-user,chardev=charnet0,id=hostnet0 
>-device 
> virtio-net-pci,netdev=hostnet0,id=net0,mac=3a:19:09:52:14:50,bus=pci.0,addr=0x3
>  -vnc 0.0.0.0:1 
>-device cirrus-vga,id=video0,bus=pci.0,addr=0x2 -device 
> virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on
> 

OK. I got it. Memory is not shared between OVS and the VM.
To make vhost-user work you must use the 'share' option for qemu memory backing.

Please refer to Documentation/topics/dpdk/vhost-user.rst for a libvirt xml
example.  "memAccess='shared'" is what you need.

QEMU cmdline should contain something like this:
-object 
memory-backend-file,id=ram-node0,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu,share=yes,size=10737418240,host-nodes=0,policy=bind
Maybe you can avoid using hugepages, but 'share=yes' is required for vhost-user 
to work.

Best regards, Ilya Maximets.



> Re. ifconfig from VM, I have difficulty getting it right now over VPN, but I 
> will get it by tomorrow morning. The 'ifconfig ' state is UP in the VM, IP 
> address is configured as 200.1.1.2/24 in the virtio-net interface in the VM. 
> Within the VM, the local address 200.1.1.2 can be pinged. 
> 
> Is there any good way to monitor packets flowing over vhost-user interface, 
> such as wireshark for eth interfaces? 
> 
> 
> Regards,
> Sundar
> 
>> -Original Message-----
>> From: Ilya Maximets [mailto:i.maxim...@samsung.com]
>> Sent: Tuesday, April 4, 2017 2:13 AM
>> To: Nadathur, Sundar <sundar.nadat...@intel.com>; ovs-
>> d...@openvswitch.org
>> Subject: Re: [ovs-dev] Traffic fails in vhost user port
>>
>> On 04.04.2017 11:29, Nadathur, Sundar wrote:
>>>> -Original Message-
>>>> From: Ilya Maximets [mailto:i.maxim...@samsung.com]
>>>> Sent: Tuesday, April 4, 2017 12:07 AM
>>>> To: ovs-dev@openvswitch.org; Nadathur, Sundar
>>>> <sundar.nadat...@intel.com>
>>>> Subject: [ovs-dev] Traffic fails in vhost user port
>>>>
>>>> Hi Sundar.
>>>
>>>>> The flows are configured as below:
>>>>> # ovs-ofctl dump-flows br0
>>>>> NXST_FLOW reply (xid=0x4):
>>>>> co

Re: [ovs-dev] Traffic fails in vhost user port

2017-04-04 Thread Ilya Maximets
On 04.04.2017 11:29, Nadathur, Sundar wrote:
>> -Original Message-
>> From: Ilya Maximets [mailto:i.maxim...@samsung.com]
>> Sent: Tuesday, April 4, 2017 12:07 AM
>> To: ovs-dev@openvswitch.org; Nadathur, Sundar
>> <sundar.nadat...@intel.com>
>> Subject: [ovs-dev] Traffic fails in vhost user port
>>
>> Hi Sundar.
> 
>>> The flows are configured as below:
>>> # ovs-ofctl dump-flows br0
>>> NXST_FLOW reply (xid=0x4):
>>> cookie=0x0, duration=2833.612s, table=0, n_packets=0, n_bytes=0,
>>> idle_age=2833, in_port=1 actions=output:5 cookie=0x2,
>>> duration=2819.820s, table=0, n_packets=0, n_bytes=0, idle_age=2819,
>>> in_port=5 actions=output:1
>>
>> I guess your flow table is configured in a wrong way.
>> The OpenFlow port of br0 is LOCAL, not 1.
>> Try this:
>>
>> # ovs-ofctl del-flows br0
>>
>> # ovs-ofctl add-flow br0 in_port=5,actions=output:LOCAL
>> # ovs-ofctl add-flow br0 in_port=LOCAL,actions=output:5
> 
> Thank you, Ilya. I did as you suggested, but the ping traffic from br0 
> (LOCAL) is dropped by the output port 5:
> # ovs-ofctl dump-flows br0
> NXST_FLOW reply (xid=0x4):
>  cookie=0x0, duration=1922.876s, table=0, n_packets=0, n_bytes=0, 
> idle_age=1922, in_port=5 actions=LOCAL
>  cookie=0x0, duration=1915.458s, table=0, n_packets=6, n_bytes=252, 
> idle_age=116, in_port=LOCAL actions=output:5
> 
> # ovs-ofctl dump-ports br0 # <-- Drops in port 5
> OFPST_PORT reply (xid=0x2): 2 ports
>   port  5: rx pkts=?, bytes=0, drop=0, errs=0, frame=?, over=?, crc=?
>tx pkts=?, bytes=0, drop=5, errs=?, coll=?
>   port LOCAL: rx pkts=43, bytes=2118, drop=0, errs=0, frame=0, over=0, crc=0
>tx pkts=0, bytes=0, drop=0, errs=0, coll=0
> 
> Wireshark shows that br0 sends out 3 ARP requests but there is no response. 
> 
>> or
>>
>> # ovs-ofctl add-flow br0 actions=NORMAL
> I tried this too after doing del-flows. The LOCAL port's MAC is learnt, 
> wireshark still shows br0 sending out ARP requests with no response. 
> 
> BTW, 'ovs-vsctl list Interface' shows the vi1 (VM port, #5) is up (most 
> fields are blank):
> _uuid   : 30d1600a-ff7d-4bf5-9fdb-b0767af3611c
> admin_state : up
> . . .
> link_speed  : []
> link_state  : up
> . . .
> mac_in_use  : "00:00:00:00:00:00"
> mtu : 1500
> mtu_request : []
> name: "vi1"
> . . .
> statistics  : {"rx_1024_to_1518_packets"=0, 
> "rx_128_to_255_packets"=0, "rx_1523_to_max_packets"=0, 
> "rx_1_to_64_packets"=0, "rx_256_to_511_packets"=0, 
> "rx_512_to_1023_packets"=0, "rx_65_to_127_packets"=0, rx_bytes=0, 
> rx_dropped=0, rx_errors=0, tx_bytes=0, tx_dropped=8}
> status  : {}
> type    : dpdkvhostuser
> 
> Is there any way to do the equivalent of a tcpdump or wireshark on a vhost 
> user port?
> 
> Thanks,
> Sundar
> 
Blank fields in 'list interface' are normal for vhostuser.

Looks like something is wrong with the VM.
Please provide the output of 'ip a' or 'ifconfig -a' from the VM and the full
output of 'ovs-vsctl list Interface vi1'. Also, the qemu cmdline or libvirt
xml can be helpful.


Best regards, Ilya Maximets.


[ovs-dev] [PATCH] Documentation: Remove external dependence on pygments.

2017-03-10 Thread Ilya Maximets
Current documentation uses syntax highlighting in 'sphinx'
via the 'pygments' library. This leads to build failures on
systems with an old version of this library.

Given that only 'windows.rst' uses highlighting, it's a
very simple change. This helps us to avoid build issues
on different systems and allows removing a painful external
dependency.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 Documentation/conf.py  |  3 -
 .../internals/contributing/documentation-style.rst | 10 ++-
 Documentation/intro/install/windows.rst| 88 +++---
 Documentation/topics/language-bindings.rst |  2 +-
 4 files changed, 51 insertions(+), 52 deletions(-)

diff --git a/Documentation/conf.py b/Documentation/conf.py
index 5909669..6a924b3 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -107,9 +107,6 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 #
 # show_authors = False
 
-# The name of the Pygments (syntax highlighting) style to use.
-# pygments_style = 'friendly'
-
 # A list of ignored prefixes for module index sorting.
 # modindex_common_prefix = []
 
diff --git a/Documentation/internals/contributing/documentation-style.rst 
b/Documentation/internals/contributing/documentation-style.rst
index ea41a07..99eec69 100644
--- a/Documentation/internals/contributing/documentation-style.rst
+++ b/Documentation/internals/contributing/documentation-style.rst
@@ -115,9 +115,11 @@ Titles
 Code
 
 
-- Use ``::``, the ``code`` role or the ``code-block:: <language>`` role to prefix
-  code. The ``code-block:: <language>`` format is preferred as this provides
-  syntax highlighting for non-Python languages, such as Bash or PowerShell.
+- Use ``::`` to prefix code.
+
+- Don't use syntax highlighting such as ``.. highlight:: <language>`` or
+  ``code-block:: <language>`` because it depends on external ``pygments``
+  library.
 
 - Prefix commands with ``$``.
 
@@ -259,7 +261,7 @@ Figures and Other Media
 - All images should be in PNG format and compressed where possible. For PNG
   files, use OptiPNG and AdvanceCOMP's ``advpng``:
 
-  .. code-block:: shell
+  ::
 
 $ optipng -o7 -zm1-9 -i0 -strip all <file>.png
 $ advpng -z4 <file>.png
diff --git a/Documentation/intro/install/windows.rst 
b/Documentation/intro/install/windows.rst
index caa9f40..2be4eb5 100644
--- a/Documentation/intro/install/windows.rst
+++ b/Documentation/intro/install/windows.rst
@@ -63,7 +63,7 @@ The following explains the steps in some detail.
   We require that you have Python six and pypiwin32 libraries installed.
   The libraries can be installed via pip command:
 
-   .. code-block:: console
+   ::
 
   $ pip install six
   $ pip install pypiwin32
@@ -140,7 +140,7 @@ you pulled the sources directly from an Open vSwitch Git 
tree or got a
 Git tree snapshot, then run boot.sh in the top source directory to build
 the "configure" script:
 
-.. code-block:: console
+::
 
$ ./boot.sh
 
@@ -153,7 +153,7 @@ Configure the package by running the configure script.  You 
should provide some
 configure options to choose the right compiler, linker, libraries, Open vSwitch
 component installation directories, etc. For example:
 
-.. code-block:: console
+::
 
$ ./configure CC=./build-aux/cccl LD="$(which link)" \
LIBS="-lws2_32 -liphlpapi -lwbemuuid -lole32 -loleaut32" \
@@ -169,7 +169,7 @@ component installation directories, etc. For example:
 
 To configure with SSL support, add the requisite additional options:
 
-.. code-block:: console
+::
 
$ ./configure CC=./build-aux/cccl LD="`which link`"  \
LIBS="-lws2_32 -liphlpapi -lwbemuuid -lole32 -loleaut32" \
@@ -181,7 +181,7 @@ To configure with SSL support, add the requisite additional 
options:
 
 Finally, to the kernel module also:
 
-.. code-block:: console
+::
 
$ ./configure CC=./build-aux/cccl LD="`which link`" \
LIBS="-lws2_32 -liphlpapi -lwbemuuid -lole32 -loleaut32" \
@@ -211,7 +211,7 @@ building on Linux, FreeBSD, or NetBSD.
 
 #. Run make for the ported executables in the top source directory, e.g.:
 
-   .. code-block:: console
+   ::
 
   $ make
 
@@ -225,25 +225,25 @@ building on Linux, FreeBSD, or NetBSD.
   all MinGW sessions and then run the below command from MSVC developers
   command prompt.:
 
-  .. code-block:: doscon
+  ::
 
  > mingw-get upgrade msys-core-bin=1.0.17-1
 
 #. To run all the unit tests in Open vSwitch, one at a time:
 
-   .. code-block:: console
+   ::
 
   $ make check
 
To run all the unit tests in Open vSwitch, up to 8 in parallel:
 
-   .. code-block:: console
+   ::
 
   $ make check TESTSUITEFLAGS="-j8"
 
 #. To install all the compiled executables on the local machine, run:
 
-   .. code-block:: console
+   ::
 
   $ make install
 
@@ -276,7 +276,7 @@ Now run ``./uninstall.cmd`` to remove the old extension. 
Once complete, run
 turn on ``TESTSIG

Re: [ovs-dev] Documentation: Report errors for use of features not in Sphinx 1.1.3.

2017-03-10 Thread Ilya Maximets
I've sent the patch for removing highlighting altogether here:
https://mail.openvswitch.org/pipermail/ovs-dev/2017-March/329651.html

Only 'windows.rst' uses this functionality. So, I think, it's better
to just remove it and forbid highlighting to avoid any issues
with external dependencies.

Best regards, Ilya Maximets.

On 10.03.2017 10:47, Ilya Maximets wrote:
> On 10.03.2017 02:27, Ben Pfaff wrote:
>> On Thu, Mar 09, 2017 at 06:15:13PM +0300, Ilya Maximets wrote:
>>> On 07.03.2017 21:54, Ben Pfaff wrote:
>>>> Signed-off-by: Ben Pfaff <b...@ovn.org>
>>>> Acked-by: Stephen Finucane <step...@that.guru>
>>>> ---
>>>>  Documentation/automake.mk  | 15 ++-
>>>>  Documentation/sphinx-version-blacklist |  2 ++
>>>>  2 files changed, 16 insertions(+), 1 deletion(-)
>>>>  create mode 100644 Documentation/sphinx-version-blacklist
>>>>
>>>> diff --git a/Documentation/automake.mk b/Documentation/automake.mk
>>>> index a74807fde532..f7f1fe61d1b7 100644
>>>> --- a/Documentation/automake.mk
>>>> +++ b/Documentation/automake.mk
>>>> @@ -86,7 +86,8 @@ EXTRA_DIST += \
>>>>Documentation/internals/contributing/documentation-style.rst \
>>>>Documentation/internals/contributing/libopenvswitch-abi.rst \
>>>>Documentation/internals/contributing/submitting-patches.rst \
>>>> -  Documentation/requirements.txt
>>>> +  Documentation/requirements.txt \
>>>> +  Documentation/sphinx-version-blacklist
>>>>  
>>>>  # You can set these variables from the command line.
>>>>  SPHINXOPTS =
>>>> @@ -120,3 +121,15 @@ endif
>>>>  .PHONY: htmldocs
>>>>  .PHONY: check-docs
>>>>  .PHONY: clean-docs
>>>> +
>>>> +ALL_LOCAL += sphinx-version-check
>>>> +sphinx-version-check: $(EXTRA_DIST)
>>>> +  @if grep -n -f $(srcdir)/Documentation/sphinx-version-blacklist $?; \
>>>> +  then \
>>>> +echo "See above for list of uses of features that Sphinx 1.1.3"; \
>>>> +echo "does not support.  Please avoid using these features.."; \
>>>> +exit 1; \
>>>> +  else \
>>>> +  : > $@; \
>>>> +  fi
>>>> +CLEANFILES += sphinx-version-check
>>>> diff --git a/Documentation/sphinx-version-blacklist 
>>>> b/Documentation/sphinx-version-blacklist
>>>> new file mode 100644
>>>> index ..a67339bf2758
>>>> --- /dev/null
>>>> +++ b/Documentation/sphinx-version-blacklist
>>>> @@ -0,0 +1,2 @@
>>>> +code-block:: *ps1con
>>>> +code-block:: *doscon
>>>
>>> I don't feel this patch is fully correct, because these are not features of
>>> sphinx, and its version is not really connected with the version of the
>>> 'pygments' library.
>>
>> OK, can you explain the real problem then?  We're making changes to the
>> documentation on the basis that old versions of Sphinx do not support
>> these features.
> 
> The real problem is the version of the 'pygments' library. Sphinx uses this
> library to highlight code blocks.
> So, RHEL7.3 contains the package 'python-pygments-2.0.2', but the lexers
> 'ps1con' and 'doscon' were introduced only in 'pygments-2.1'. That is why
> the build fails.
> 
> '''
> class pygments.lexers.shell.MSDOSSessionLexer
> Short names:  doscon
> Filenames:None
> MIME types:   None
> 
> Lexer for simplistic MSDOS sessions.
> 
> New in version 2.1.
> 
> class pygments.lexers.shell.PowerShellSessionLexer
> Short names:  ps1con
> Filenames:None
> MIME types:   None
> 
> Lexer for simplistic Windows PowerShell sessions.
> 
> New in version 2.1.
> '''
> 
> On page [1] of the 'pygments' project you can check the minimal version
> required for every lexer.
> 
> Maybe we need to add a minimal version of 'pygments' to requirements.txt.
> In that case we would be able to create a whitelist of all supported lexers.
> 
> Another option:
> Do we need the code highlighting at all?
> We can just replace all the '.. code-block:: <language>' with simple '::' [2].
> In this case, we will not have any external dependencies other than sphinx.
> 
> P.S. My previous patch [3] is just about the ability to build the
>  documentation with sphinx 1.1 because there is no reason to block it.
> 
> [1] http://pygments.org/docs/lexers/
> [2] http://www.sphinx-doc.org/en/stable/rest.html#source-code
> [3] https://mail.openvswitch.org/pipermail/ovs-dev/2017-March/329590.html
> 
> Best regards, Ilya Maximets.
> 


Re: [ovs-dev] Documentation: Report errors for use of features not in Sphinx 1.1.3.

2017-03-09 Thread Ilya Maximets
On 10.03.2017 02:27, Ben Pfaff wrote:
> On Thu, Mar 09, 2017 at 06:15:13PM +0300, Ilya Maximets wrote:
>> On 07.03.2017 21:54, Ben Pfaff wrote:
>>> Signed-off-by: Ben Pfaff <b...@ovn.org>
>>> Acked-by: Stephen Finucane <step...@that.guru>
>>> ---
>>>  Documentation/automake.mk  | 15 ++-
>>>  Documentation/sphinx-version-blacklist |  2 ++
>>>  2 files changed, 16 insertions(+), 1 deletion(-)
>>>  create mode 100644 Documentation/sphinx-version-blacklist
>>>
>>> diff --git a/Documentation/automake.mk b/Documentation/automake.mk
>>> index a74807fde532..f7f1fe61d1b7 100644
>>> --- a/Documentation/automake.mk
>>> +++ b/Documentation/automake.mk
>>> @@ -86,7 +86,8 @@ EXTRA_DIST += \
>>> Documentation/internals/contributing/documentation-style.rst \
>>> Documentation/internals/contributing/libopenvswitch-abi.rst \
>>> Documentation/internals/contributing/submitting-patches.rst \
>>> -   Documentation/requirements.txt
>>> +   Documentation/requirements.txt \
>>> +   Documentation/sphinx-version-blacklist
>>>  
>>>  # You can set these variables from the command line.
>>>  SPHINXOPTS =
>>> @@ -120,3 +121,15 @@ endif
>>>  .PHONY: htmldocs
>>>  .PHONY: check-docs
>>>  .PHONY: clean-docs
>>> +
>>> +ALL_LOCAL += sphinx-version-check
>>> +sphinx-version-check: $(EXTRA_DIST)
>>> +   @if grep -n -f $(srcdir)/Documentation/sphinx-version-blacklist $?; \
>>> +   then \
>>> + echo "See above for list of uses of features that Sphinx 1.1.3"; \
>>> + echo "does not support.  Please avoid using these features.."; \
>>> + exit 1; \
>>> +   else \
>>> +  : > $@; \
>>> +   fi
>>> +CLEANFILES += sphinx-version-check
>>> diff --git a/Documentation/sphinx-version-blacklist 
>>> b/Documentation/sphinx-version-blacklist
>>> new file mode 100644
>>> index ..a67339bf2758
>>> --- /dev/null
>>> +++ b/Documentation/sphinx-version-blacklist
>>> @@ -0,0 +1,2 @@
>>> +code-block:: *ps1con
>>> +code-block:: *doscon
>>
>> I don't feel this patch is fully correct, because these are not features of
>> sphinx, and its version is not really connected with the version of the
>> 'pygments' library.
> 
> OK, can you explain the real problem then?  We're making changes to the
> documentation on the basis that old versions of Sphinx do not support
> these features.

The real problem is the version of the 'pygments' library. Sphinx uses this
library to highlight code blocks.
So, RHEL7.3 contains the package 'python-pygments-2.0.2', but the lexers
'ps1con' and 'doscon' were introduced only in 'pygments-2.1'. That is why
the build fails.

'''
class pygments.lexers.shell.MSDOSSessionLexer
Short names:doscon
Filenames:  None
MIME types: None

Lexer for simplistic MSDOS sessions.

New in version 2.1.

class pygments.lexers.shell.PowerShellSessionLexer
Short names:ps1con
Filenames:  None
MIME types: None

Lexer for simplistic Windows PowerShell sessions.

New in version 2.1.
'''

On page [1] of the 'pygments' project you can check the minimal version
required for every lexer.

Maybe we need to add a minimal version of 'pygments' to requirements.txt.
In that case we would be able to create a whitelist of all supported lexers.

Another option:
Do we need the code highlighting at all?
We can just replace all the '.. code-block:: <language>' with simple '::' [2].
In this case, we will not have any external dependencies other than sphinx.

P.S. My previous patch [3] is just about the ability to build the
 documentation with sphinx 1.1 because there is no reason to block it.

[1] http://pygments.org/docs/lexers/
[2] http://www.sphinx-doc.org/en/stable/rest.html#source-code
[3] https://mail.openvswitch.org/pipermail/ovs-dev/2017-March/329590.html

Best regards, Ilya Maximets.


Re: [ovs-dev] Documentation: Report errors for use of features not in Sphinx 1.1.3.

2017-03-09 Thread Ilya Maximets
On 07.03.2017 21:54, Ben Pfaff wrote:
> Signed-off-by: Ben Pfaff <b...@ovn.org>
> Acked-by: Stephen Finucane <step...@that.guru>
> ---
>  Documentation/automake.mk  | 15 ++-
>  Documentation/sphinx-version-blacklist |  2 ++
>  2 files changed, 16 insertions(+), 1 deletion(-)
>  create mode 100644 Documentation/sphinx-version-blacklist
> 
> diff --git a/Documentation/automake.mk b/Documentation/automake.mk
> index a74807fde532..f7f1fe61d1b7 100644
> --- a/Documentation/automake.mk
> +++ b/Documentation/automake.mk
> @@ -86,7 +86,8 @@ EXTRA_DIST += \
>   Documentation/internals/contributing/documentation-style.rst \
>   Documentation/internals/contributing/libopenvswitch-abi.rst \
>   Documentation/internals/contributing/submitting-patches.rst \
> - Documentation/requirements.txt
> + Documentation/requirements.txt \
> + Documentation/sphinx-version-blacklist
>  
>  # You can set these variables from the command line.
>  SPHINXOPTS =
> @@ -120,3 +121,15 @@ endif
>  .PHONY: htmldocs
>  .PHONY: check-docs
>  .PHONY: clean-docs
> +
> +ALL_LOCAL += sphinx-version-check
> +sphinx-version-check: $(EXTRA_DIST)
> + @if grep -n -f $(srcdir)/Documentation/sphinx-version-blacklist $?; \
> + then \
> +   echo "See above for list of uses of features that Sphinx 1.1.3"; \
> +   echo "does not support.  Please avoid using these features.."; \
> +   exit 1; \
> + else \
> +  : > $@; \
> + fi
> +CLEANFILES += sphinx-version-check
> diff --git a/Documentation/sphinx-version-blacklist 
> b/Documentation/sphinx-version-blacklist
> new file mode 100644
> index ..a67339bf2758
> --- /dev/null
> +++ b/Documentation/sphinx-version-blacklist
> @@ -0,0 +1,2 @@
> +code-block:: *ps1con
> +code-block:: *doscon

I don't feel this patch is fully correct, because these are not features of
sphinx, and its version is not really connected with the version of the
'pygments' library.

What do you think?

Best regards, Ilya Maximets.


[ovs-dev] [PATCH RFC] doc: Decrease build requirements to support RHEL7.

2017-03-07 Thread Ilya Maximets
Sphinx 1.1.3 on RHEL7 is able to properly build the documentation.
One last thing is that the 'ps1con' and 'doscon' lexers are not
supported by the available python-pygments-2.0.2. Changing them
to 'ps1' and 'console' accordingly doesn't make much difference.

Sphinx discovery is fixed because 'sphinx-build v1.1.3' doesn't
support the '--version' option.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---

 Note:
Marked as RFC because I don't know sphinx well.

 Documentation/conf.py   |  2 +-
 Documentation/intro/install/windows.rst | 70 -
 Documentation/requirements.txt  |  2 +-
 m4/openvswitch.m4   |  2 +-
 4 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/Documentation/conf.py b/Documentation/conf.py
index 389ef70..5909669 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -30,7 +30,7 @@ except ImportError:
 
 # If your documentation needs a minimal Sphinx version, state it here.
 #
-needs_sphinx = '1.2'
+needs_sphinx = '1.1'
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
diff --git a/Documentation/intro/install/windows.rst 
b/Documentation/intro/install/windows.rst
index caa9f40..d78a442 100644
--- a/Documentation/intro/install/windows.rst
+++ b/Documentation/intro/install/windows.rst
@@ -225,7 +225,7 @@ building on Linux, FreeBSD, or NetBSD.
   all MinGW sessions and then run the below command from MSVC developers
   command prompt.:
 
-  .. code-block:: doscon
+  .. code-block:: console
 
  > mingw-get upgrade msys-core-bin=1.0.17-1
 
@@ -276,7 +276,7 @@ Now run ``./uninstall.cmd`` to remove the old extension. 
Once complete, run
 turn on ``TESTSIGNING`` boot option or 'Disable Driver Signature
 Enforcement' during boot.  The following commands can be used:
 
-.. code-block:: doscon
+.. code-block:: console
 
> bcdedit /set LOADOPTIONS DISABLE_INTEGRITY_CHECKS
> bcdedit /set TESTSIGNING ON
@@ -294,7 +294,7 @@ to work (covered later).
 The command to create a new switch named 'OVS-Extended-Switch' using a physical
 NIC named 'Ethernet 1' is:
 
-.. code-block:: ps1con
+.. code-block:: ps1
 
PS > New-VMSwitch "OVS-Extended-Switch" -NetAdapterName "Ethernet 1"
 
@@ -307,7 +307,7 @@ In the properties of any switch, you should now see 
"Open vSwitch
 Extension" under 'Extensions'.  Click the check box to enable the extension.
 An alternative way to do the same is to run the following command:
 
-.. code-block:: ps1con
+.. code-block:: ps1
 
PS > Enable-VMSwitchExtension "Open vSwitch Extension" OVS-Extended-Switch
 
@@ -330,7 +330,7 @@ database, ovsdb-server. Each machine on which Open vSwitch 
is installed should
 run its own copy of ovsdb-server. Before ovsdb-server itself can be started,
 configure a database that it can use:
 
-.. code-block:: doscon
+.. code-block:: console
 
> ovsdb-tool create C:\openvswitch\etc\openvswitch\conf.db \
C:\openvswitch\usr\share\openvswitch\vswitch.ovsschema
@@ -338,7 +338,7 @@ configure a database that it can use:
 Configure ovsdb-server to use database created above and to listen on a Unix
 domain socket:
 
-.. code-block:: doscon
+.. code-block:: console
 
> ovsdb-server -vfile:info --remote=punix:db.sock --log-file \
--pidfile --detach
@@ -351,7 +351,7 @@ Initialize the database using ovs-vsctl. This is only 
necessary the first time
 after you create the database with ovsdb-tool, though running it at any time is
 harmless:
 
-.. code-block:: doscon
+.. code-block:: console
 
> ovs-vsctl --no-wait init
 
@@ -359,14 +359,14 @@ harmless:
 
If you would later like to terminate the started ovsdb-server, run:
 
-   .. code-block:: doscon
+   .. code-block:: console
 
   > ovs-appctl -t ovsdb-server exit
 
 Start the main Open vSwitch daemon, telling it to connect to the same Unix
 domain socket:
 
-.. code-block:: doscon
+.. code-block:: console
 
> ovs-vswitchd -vfile:info --log-file --pidfile --detach
 
@@ -374,7 +374,7 @@ domain socket:
 
If you would like to terminate the started ovs-vswitchd, run:
 
-   .. code-block:: doscon
+   .. code-block:: console
 
   > ovs-appctl exit
 
@@ -394,7 +394,7 @@ Add bridges
 Let's start by creating an integration bridge, ``br-int`` and a PIF bridge,
 ``br-pif``:
 
-.. code-block:: doscon
+.. code-block:: console
 
> ovs-vsctl add-br br-int
> ovs-vsctl add-br br-pif
@@ -408,7 +408,7 @@ Let's start by creating an integration bridge, ``br-int`` 
and a PIF bridge,
 
 Validate that ports are added by dumping from both ovs-dpctl and ovs-vsctl:
 
-.. code-block:: doscon
+.. code-block:: console
 
> ovs-dpctl show
system@ovs-system:
@@ -457,7 +457,7 @@ enable them and set the corresponding values to it to make 
them IP-able.
 
 As a who

Re: [ovs-dev] [PATCH] dpdk: Redirect DPDK log to OVS logging subsystem.

2017-03-05 Thread Ilya Maximets
On 02.03.2017 22:22, Aaron Conole wrote:
> Ilya Maximets <i.maxim...@samsung.com> writes:
> 
>> This should be helpful for having all the logs in one place.
>> 'ovs-appctl vlog' commands for the 'dpdk' module can be used
>> to configure the log level. The lower bound for DPDK logging
>> (--log-level) can still be passed through the 'dpdk-extra' field.
>>
>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
>> ---
> 
> +1 - good change!

Thanks.

> ...
>> diff --git a/lib/dpdk.c b/lib/dpdk.c
>> index c1626e2..eb03ec9 100644
>> --- a/lib/dpdk.c
>> +++ b/lib/dpdk.c
> ...
>> @@ -262,6 +266,45 @@ argv_release(char **dpdk_argv, char **dpdk_argv_release, size_t dpdk_argc)
>>  free(dpdk_argv);
>>  }
>>  
>> +static ssize_t
>> +dpdk_log_write(void *c OVS_UNUSED, const char *buf, size_t size)
>> +{
>> +char *str = xmalloc(size + 1);
>> +
>> +strncpy(str, buf, size);
>> +str[size] = '\0';
> 
> Small nit - does it make more sense here to use xmemdup0(), instead?  If
> you're not worried about non-printable characters, what about xstrdup or
> even xasprintf("%s", buf)?

Good point. I've sent v2 with 'xmemdup0()'.

P.S. String functions can't be used here. Otherwise, there would be no
 need for a copy at all.
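
To illustrate the constraint (a sketch; the 'buf' that fopencookie()
passes to the write callback is not NUL-terminated, it is just 'size'
bytes):

    /* xstrdup(buf) or xasprintf("%s", buf) would read past 'size'
     * looking for a terminator.  xmemdup0() copies exactly 'size'
     * bytes and appends the '\0' itself. */
    char *str = xmemdup0(buf, size);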

Best regards, Ilya Maximets.


[ovs-dev] [PATCH v2] dpdk: Redirect DPDK log to OVS logging subsystem.

2017-03-05 Thread Ilya Maximets
This should be helpful for having all the logs in one place.
'ovs-appctl vlog' commands for the 'dpdk' module can be used
to configure the log level. The lower bound for DPDK logging
(--log-level) can still be passed through the 'dpdk-extra' field.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
Version 2:
* 'xmemdup0' used inside log function.

 NEWS   |  5 +
 lib/dpdk.c | 48 
 2 files changed, 53 insertions(+)

diff --git a/NEWS b/NEWS
index ce9fe88..8d4af9e 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,11 @@ Post-v2.7.0
`egress_pkt_mark` OVSDB option.
- EMC insertion probability is reduced to 1% and is configurable via
  the new 'other_config:emc-insert-inv-prob' option.
+   - DPDK:
+ * DPDK log messages redirected to OVS logging subsystem.
+   Log level can be changed in a usual OVS way using
+   'ovs-appctl vlog' commands for 'dpdk' module. Lower bound
+   still can be configured via extra arguments for DPDK EAL.
 
 v2.7.0 - xx xxx 
 -
diff --git a/lib/dpdk.c b/lib/dpdk.c
index c1626e2..c458744 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -17,10 +17,12 @@
 #include <config.h>
 #include "dpdk.h"
 
+#include <stdio.h>
 #include <string.h>
 #include <getopt.h>
 
+#include <rte_log.h>
 #include <rte_memzone.h>
 #ifdef DPDK_PDUMP
 #include <rte_mempool.h>
 #include 
@@ -36,6 +38,8 @@
 
 VLOG_DEFINE_THIS_MODULE(dpdk);
 
+static FILE *log_stream = NULL;   /* Stream for DPDK log redirection */
+
 static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
 
 static int
@@ -262,6 +266,42 @@ argv_release(char **dpdk_argv, char **dpdk_argv_release, size_t dpdk_argc)
 free(dpdk_argv);
 }
 
+static ssize_t
+dpdk_log_write(void *c OVS_UNUSED, const char *buf, size_t size)
+{
+char *str = xmemdup0(buf, size);
+
+switch (rte_log_cur_msg_loglevel()) {
+case RTE_LOG_DEBUG:
+VLOG_DBG("%s", str);
+break;
+case RTE_LOG_INFO:
+case RTE_LOG_NOTICE:
+VLOG_INFO("%s", str);
+break;
+case RTE_LOG_WARNING:
+VLOG_WARN("%s", str);
+break;
+case RTE_LOG_ERR:
+VLOG_ERR("%s", str);
+break;
+case RTE_LOG_CRIT:
+case RTE_LOG_ALERT:
+case RTE_LOG_EMERG:
+VLOG_EMER("%s", str);
+break;
+default:
+OVS_NOT_REACHED();
+}
+
+free(str);
+return size;
+}
+
+static cookie_io_functions_t dpdk_log_func = {
+.write = dpdk_log_write,
+};
+
 static void
 dpdk_init__(const struct smap *ovs_other_config)
 {
@@ -273,6 +313,14 @@ dpdk_init__(const struct smap *ovs_other_config)
 cpu_set_t cpuset;
 char *sock_dir_subcomponent;
 
+log_stream = fopencookie(NULL, "w+", dpdk_log_func);
+if (log_stream == NULL) {
+VLOG_ERR("Can't redirect DPDK log: %s.", ovs_strerror(errno));
+} else {
+setbuf(log_stream, NULL);
+rte_openlog_stream(log_stream);
+}
+
 if (process_vhost_flags("vhost-sock-dir", ovs_rundir(),
 NAME_MAX, ovs_other_config,
 &sock_dir_subcomponent)) {
-- 
2.7.4



[ovs-dev] [PATCH] dpif-netdev: Simplify emc replacement policy.

2017-07-28 Thread Ilya Maximets
The current EMC replacement policy allows replacing an active EMC entry
even if there are dead (empty) entries available. This leads to
EMC thrashing even with a few hundred flows. In some cases PMD
threads start to execute classifier lookups even in tests with
50-100 active flows.

Fix this by removing the strange hash comparison rule from the
replacement checking. The new behavior also matches the comment that
describes the replacement policy; that comment wasn't accurate before.

Testing shows stable operation of the exact match cache without misses
with up to 3072 active flows and only 0.05% EMC misses with
4096 flows. With a higher number of flows there is no significant
difference from the current implementation.

For reference, the number of EMC misses in current master is
around 20% for the case with 2048 active flows.

Testing was performed with 100% EMC insert probability.
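
For reference, the replacement loop in emc_insert() looks roughly like
this with the patch applied (simplified sketch, not the full function):

    struct emc_entry *to_be_replaced = NULL;
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH (cache, current_entry, key->hash) {
        if (netdev_flow_key_equal(&current_entry->key, key)) {
            emc_change_entry(current_entry, flow, NULL);  /* Refresh hit. */
            return;
        }
        /* Prefer a dead (empty) entry; an alive entry is evicted only
         * if no dead one is available. */
        if (!to_be_replaced
            || (emc_entry_alive(to_be_replaced)
                && !emc_entry_alive(current_entry))) {
            to_be_replaced = current_entry;
        }
    }
    emc_change_entry(to_be_replaced, flow, key);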

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 47a9fa0..4a8dd80 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2054,8 +2054,7 @@ emc_insert(struct emc_cache *cache, const struct 
netdev_flow_key *key,
  * in the first entry where it can be */
 if (!to_be_replaced
 || (emc_entry_alive(to_be_replaced)
-&& !emc_entry_alive(current_entry))
-|| current_entry->key.hash < to_be_replaced->key.hash) {
+&& !emc_entry_alive(current_entry))) {
 to_be_replaced = current_entry;
 }
 }
-- 
2.7.4



Re: [ovs-dev] [patch_v1] dpdk: Fix device cleanup.

2017-07-28 Thread Ilya Maximets
In general, I have no objections to returning the 'detach' appctl command.

Comments inline.

On 26.07.2017 08:09, Darrell Ball wrote:
> Commit 5dcde09c80a8 was introduced to make detaching more
> automatic without using an additional command.
> 
> Sometimes, since commit 5dcde09c80a8, dpdk devices are
> not detached when del-port is issued; command example:
> 
> sudo ovs-vsctl del-port br0 dpdk1
> 
> This can happen when vswitchd is (re)started with an existing
> database and devices are already bound to dpdk.
> 
> A discussion is here:
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-June/333462.html
> along with a possible solution.
> 
> Since we are nearing the end of a release, a safe approach is needed
> at this time.
> One approach is to revert 5dcde09c80a8.  This patch does not do that
> but reinstates the command ovs-appctl netdev-dpdk/detach to handle
> cases when del-port will not work.
> 
> Fixes: 5dcde09c80a8 ("netdev-dpdk: Fix device leak on port deletion.")
> CC: Ilya Maximets <i.maxim...@samsung.com>
> Signed-off-by: Darrell Ball <dlu...@gmail.com>
> ---
>  Documentation/howto/dpdk.rst | 12 ++
>  lib/netdev-dpdk.c| 52 
> +++-
>  2 files changed, 63 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
> index af01d3e..3c198a2 100644
> --- a/Documentation/howto/dpdk.rst
> +++ b/Documentation/howto/dpdk.rst
> @@ -331,6 +331,18 @@ Detaching will be performed while processing del-port 
> command::
>  
>  $ ovs-vsctl del-port dpdkx
>  
> +Sometimes, the del-port command may not detach the device.
> +Detaching can be confirmed by the appearance of an INFO log.
> +For example::
> +
> +Device '0000:04:00.0' has been detached
> +
> +If the log is not seen, then the port can be detached using::
> +
> +$ ovs-appctl netdev-dpdk/detach 0000:01:00.0
> +
> +Again, detaching can be confirmed by the above INFO log.
> +
>  This feature is not supported with VFIO and does not work with some NICs.

I think we need to clarify here that an attempt to detach a non-detachable
device will lead to closing that device and possibly to the inability to
re-add it back to OVS. Even if the device is added successfully after
execution of 'ovs-vsctl add-port', it may not work properly (possible
crashes or HW faults).

For example, the 'cxgbe' DPDK driver can not be restored after
rte_eth_dev_close().

>  For more information please refer to the `DPDK Port Hotplug Framework
>  
> <http://dpdk.org/doc/guides/prog_guide/port_hotplug_framework.html#hotplug>`__.
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index ea17b97..812d262 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -1013,7 +1013,7 @@ netdev_dpdk_destruct(struct netdev *netdev)
>  if (rte_eth_dev_detach(dev->port_id, devname) < 0) {
>  VLOG_ERR("Device '%s' can not be detached", dev->devargs);
>  } else {
> -VLOG_INFO("Device '%s' detached", devname);
> +VLOG_INFO("Device '%s' has been detached", devname);
>  }
>  }
>  
> @@ -2449,6 +2449,53 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, 
> int argc,
>  unixctl_command_reply(conn, "OK");
>  }
>  
> +static void
> +netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
> +   const char *argv[], void *aux OVS_UNUSED)
> +{
> +int ret;
> +char *response;
> +uint8_t port_id;
> +char devname[RTE_ETH_NAME_MAX_LEN];
> +struct netdev_dpdk *dev;
> +
> +ovs_mutex_lock(&dpdk_mutex);
> +
> +if (!rte_eth_dev_count() || rte_eth_dev_get_port_by_name(argv[1],
> + &port_id)) {
> +response = xasprintf("Device '%s' not found in DPDK", argv[1]);
> +goto error;
> +}
> +
> +dev = netdev_dpdk_lookup_by_port_id(port_id);
> +if (dev) {
> +response = xasprintf("Device '%s' is being used by interface '%s'. "
> + "Remove it before detaching",
> + argv[1], netdev_get_name(&dev->up));
> +goto error;
> +}
> +
> +rte_eth_dev_close(port_id);
> +
> +ret = rte_eth_dev_detach(port_id, devname);
> +if (ret < 0) {
> +response = xasprintf("Device '%s' can not be detached", argv[1]);
> +goto error;
> +}
> +
> +response = xasprintf("Device '%s' has been detached", argv[1]);
> +
> +ovs_mutex_unlock(&dpdk_mutex);
> +   

Re: [ovs-dev] [PATCH RFC v2 4/4] dpif-netdev: Time based output batching.

2017-07-31 Thread Ilya Maximets
On 28.07.2017 10:20, Darrell Ball wrote:
> I have not tested yet.
> 
> However, I would have expected something like a max-latency config to be
> specific to netdev-dpdk port types.

IMHO, if we can make it generic, we must make it generic. Making this
functionality netdev-dpdk specific will break the ability to test it using
unit tests. As the change is complex and has a lot of pitfalls, like
possibly stuck packets and possible latency issues, this code should be
covered by unit tests to simplify support and modifications.
(And it's already partly covered because it is generic. And I fixed many
minor issues during development through unit test failures.)

In the future this can also be used to improve the performance of netdev-linux
by replacing sendmsg() with batched sendmmsg(). This should significantly
increase the performance of flood actions while MACs are not yet learned in
the NORMAL action.
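
For illustration, a sketch of that idea (an assumption, not existing OVS
code; 'sock_fd' stands for the AF_PACKET socket that netdev-linux already
uses for its per-packet sendmsg() calls, and sendmmsg() needs
<sys/socket.h> with _GNU_SOURCE defined):

    /* Send a whole output batch with a single sendmmsg() syscall. */
    struct mmsghdr mmsg[NETDEV_MAX_BURST];
    struct iovec iov[NETDEV_MAX_BURST];
    int i;

    memset(mmsg, 0, sizeof mmsg);
    for (i = 0; i < batch->count; i++) {
        iov[i].iov_base = dp_packet_data(batch->packets[i]);
        iov[i].iov_len = dp_packet_size(batch->packets[i]);
        mmsg[i].msg_hdr.msg_iov = &iov[i];
        mmsg[i].msg_hdr.msg_iovlen = 1;
    }
    if (sendmmsg(sock_fd, mmsg, batch->count, 0) < 0) {
        /* Error handling would mirror the current sendmsg() path. */
    }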

> This type of code also seems to intersect with present and future QoS 
> considerations in netdev-dpdk

Maybe, but there are also some related features on the mailing list, like rx
queue prioritization, which are implemented in a generic way at the
dpif-netdev layer.

> 
> -Original Message-----
> From: Ilya Maximets <i.maxim...@samsung.com>
> Date: Wednesday, July 26, 2017 at 8:21 AM
> To: "ovs-dev@openvswitch.org" <ovs-dev@openvswitch.org>, Bhanuprakash 
> Bodireddy <bhanuprakash.bodire...@intel.com>
> Cc: Heetae Ahn <heetae82@samsung.com>, Ben Pfaff <b...@ovn.org>, Antonio 
> Fischetti <antonio.fische...@intel.com>, Eelco Chaudron 
> <echau...@redhat.com>, Ciara Loftus <ciara.lof...@intel.com>, Kevin Traynor 
> <ktray...@redhat.com>, Darrell Ball <db...@vmware.com>, Ilya Maximets 
> <i.maxim...@samsung.com>
> Subject: [PATCH RFC v2 4/4] dpif-netdev: Time based output batching.
> 
> This allows collecting packets from more than one RX burst
> and sending them together, with a configurable maximum latency.
> 
> 'other_config:output-max-latency' can be used to configure
> time that a packet can wait in output batch for sending.
> 
> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
> ---
> 
> Millisecond granularity is used for now. It can easily be switched to use
> microseconds instead.
> 
>  lib/dpif-netdev.c| 97 
> +++-
>  vswitchd/vswitch.xml | 15 
>  2 files changed, 95 insertions(+), 17 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 07c7dad..e5f8a3d 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -84,6 +84,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>  #define MAX_RECIRC_DEPTH 5
>  DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
>  
> +/* Use instant packet send by default. */
> +#define DEFAULT_OUTPUT_MAX_LATENCY 0
> +
>  /* Configuration parameters. */
>  enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow 
> table. */
>  enum { MAX_METERS = 65536 };/* Maximum number of meters. */
> @@ -261,6 +264,9 @@ struct dp_netdev {
>  struct hmap ports;
>  struct seq *port_seq;   /* Incremented whenever a port changes. 
> */
>  
> +/* The time that a packet can wait in output batch for sending. */
> +atomic_uint32_t output_max_latency;
> +
>  /* Meters. */
>  struct ovs_mutex meter_locks[N_METER_LOCKS];
>  struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
> @@ -498,6 +504,7 @@ struct tx_port {
>  int qid;
>  long long last_used;
>  struct hmap_node node;
> +long long output_time;
>  struct dp_packet_batch output_pkts;
>  };
>  
> @@ -570,6 +577,9 @@ struct dp_netdev_pmd_thread {
>   * than 'cmap_count(dp->poll_threads)'. */
>  const int static_tx_qid;
>  
> +/* Number of filled output batches. */
> +int n_output_batches;
> +
>  struct ovs_mutex port_mutex;/* Mutex for 'poll_list' and 
> 'tx_ports'. */
>  /* List of rx queues to poll. */
>  struct hmap poll_list OVS_GUARDED;
> @@ -663,9 +673,9 @@ static void dp_netdev_add_rxq_to_pmd(struct 
> dp_netdev_pmd_thread *pmd,
>  static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
> struct rxq_poll *poll)
>  OVS_REQUIRES(pmd->port_mutex);
> -static void
> +static int
>  dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
> -   long long now);
> +

Re: [ovs-dev] Openvswitch crash when bringing down the dpdk bond port using "ovs-ofctl mod-port br-prv dpdk1 down"

2017-07-31 Thread Ilya Maximets
On 31.07.2017 16:05, Ben Pfaff wrote:
> Ilya, should we apply this patch to branch-2.6?  Are there other patches
> that should be backported?

It's definitely a bug and the fix should be backported if someone wants to
use branch-2.6 with the DPDK datapath. I traced only this exact case, so
it's hard to say if anything else should be backported, but this patch
should fix the described issue without any additional changes.
 
> On Wed, Jul 26, 2017 at 03:28:12PM +0300, Ilya Maximets wrote:
>> Hi.
>>
>> You need to backport at least following patch:
>>
>> commit 3b1fb0779b87788968c1a6a9ff295a9883547485
>> Author: Daniele Di Proietto <diproiet...@vmware.com>
>> Date:   Tue Nov 15 15:40:49 2016 -0800
>>
>> netdev-dpdk: Don't call rte_dev_stop() in update_flags().
>> 
>> Calling rte_eth_dev_stop() while the device is running causes a crash.
>> 
>> We could use rte_eth_dev_set_link_down(), but not every PMD implements
>> that, and I found one NIC where that has no effect.
>> 
>> Instead, this commit checks if the device has the NETDEV_UP flag when
>> transmitting or receiving (similarly to what we do for vhostuser). I
>> didn't notice any performance difference with this check in case the
>> device is up.
>> 
>> An alternative would be to remove the device queues from the pmd threads
>> tx and receive cache, but that requires reconfiguration and I'd prefer
>> to avoid it, because the change can come from OpenFlow.
>> 
>> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>> Acked-by: Ilya Maximets <i.maxim...@samsung.com>
>>
>> This should fix your issue.
>> In general, I suggest using the stable 2.7 OVS; there have been too many
>> DPDK-related changes, including stability fixes, since 2.6.
>>
>> Best regards, Ilya Maximets.
>>
>>> Hi
>>>   We are experiencing a openvswitch crash when bringing down the dpdk bond 
>>> port using "ovs-ofctl mod-port br-prv dpdk1 down".
>>>
>>> backtrace of core is like below. Is there any issue reported earlier  for 
>>> this type of crash in openvswitch community.
>>>
>>> (gdb) bt
>>> #0  ixgbe_rxq_rearm (rxq=0x7fa45061f800) at 
>>> /home/sdn/new_cloud_sdn_switch_2/cloud-sdn-switch/dpdk/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c:98
>>> #1  _recv_raw_pkts_vec (split_packet=0x0, nb_pkts=32, rx_pkts=<optimized out>, rxq=0x7fa45061f800)
>>> at 
>>> /home/sdn/new_cloud_sdn_switch_2/cloud-sdn-switch/dpdk/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c:290
>>> #2  ixgbe_recv_pkts_vec (rx_queue=0x7fa45061f800, rx_pkts=<optimized out>,
>>> nb_pkts=<optimized out>)
>>> at 
>>> /home/sdn/new_cloud_sdn_switch_2/cloud-sdn-switch/dpdk/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c:474
>>> #3  0x00e500e4 in ?? ()
>>> #4  0x004600e6 in ?? ()
>>> #5  0x006a0069 in ?? ()
>>> #6  0x006c006b in ?? ()
>>> #7  0x00ec006d in ?? ()
>>> #8  0x00ee00ed in ?? ()
>>> #9  0x0001537f5780 in ?? ()
>>> #10 0x in ?? ()
>>> (gdb)
>>>
>>>
>>> I have analyzed the core and it seems to be the result of a device stop and
>>> a packet receive from the port happening at the same time in two threads:
>>> the OVS main thread (device stop) and a PMD thread (packet receive). More
>>> precisely, the main thread is cleaning the packet buffers from the rxq
>>> sw_ring to avoid a packet buffer leak while in parallel the PMD thread is
>>> filling packet buffers in the sw_ring/descriptor ring as part of
>>> ixgbe_recv_pkts_vec.
>>>
>>> version used is: openvswitch (2.6.1) with dpdk (16.11).
>>>
>>> This crash is not every time reproducible but frequency seems to be high.
>>>
>>> I am new to openvswitch community and this is first time I am posting a 
>>> query. let me know if anything you require from my side.
>>>
>>> Thanks
>>> Keshav
>>
>> ___
>> dev mailing list
>> d...@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> 
> 
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2 0/2] EMC management fixes.

2017-07-31 Thread Ilya Maximets
Ilya Maximets (2):
  dpif-netdev: Decrease range of values for EMC probability.
  dpif-netdev: Fix emc replacement policy.

 lib/dpif-netdev.c| 33 +++--
 vswitchd/vswitch.xml |  3 ++-
 2 files changed, 25 insertions(+), 11 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] dpif-netdev: Simplify emc replacement policy.

2017-07-31 Thread Ilya Maximets
On 31.07.2017 04:41, Darrell Ball wrote:
> 
> 
> -Original Message-
> From: <ovs-dev-boun...@openvswitch.org> on behalf of "Wang, Yipeng1" 
> <yipeng1.w...@intel.com>
> Date: Friday, July 28, 2017 at 11:04 AM
> To: Ilya Maximets <i.maxim...@samsung.com>, "ovs-dev@openvswitch.org" 
> <ovs-dev@openvswitch.org>
> Cc: Heetae Ahn <heetae82@samsung.com>
> Subject: Re: [ovs-dev] [PATCH] dpif-netdev: Simplify emc replacement policy.
> 
> Good catch. But I think the hash comparison is there to "randomly" choose one
> of the two entries to replace when both entries are live.
> Your change would always replace the first one in such a case. It might cause
> some thrashing issues for certain traffic. Meanwhile, in my experience, the
> original "hash comparison" is also not a good way to choose a random entry; I
> encountered some thrashing issues before.
> 
> I think we want some condition like the one below, but with a fast way to
> choose a random entry.
> 
> if (!to_be_replaced || (emc_entry_alive(to_be_replaced) &&
> !emc_entry_alive(current_entry)))
> to_be_replaced = current_entry;
> else if (emc_entry_alive(to_be_replaced) &&
> emc_entry_alive(current_entry))
>   to_be_replaced = random_entry;

I agree that we need something like a random choice of the alive entry to
replace. I thought about this a little and came up with the idea of reusing
the random value generated for the insertion probability. This should give a
good distribution for replacement. I'll send v2 soon with that approach.
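
A toy sketch of that idea (illustrative only, not the actual v2 patch): one
random draw drives both the insertion-probability check and the victim
choice between two alive entries, so no extra random call is needed.

---
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Stand-in for OVS's random_uint32() from lib/random.h. */
static uint32_t
random_uint32(void)
{
    return (uint32_t) rand();
}

static void
emc_probabilistic_insert_sketch(uint32_t min, uint32_t prob_mask,
                                bool e0_alive, bool e1_alive)
{
    uint32_t rnd = random_uint32();

    if (min && (rnd & prob_mask) < min) {
        int victim;

        if (!e0_alive) {
            victim = 0;               /* Prefer dead entries first. */
        } else if (!e1_alive) {
            victim = 1;
        } else {
            victim = rnd >> 31;       /* Both alive: random pick from
                                       * an unused bit of 'rnd'. */
        }
        (void) victim;                /* Insert into 'victim' here. */
    }
}
---

Reusing the single draw keeps the fast path free of a second RNG call per
packet while still giving an even victim distribution.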

> //
> 
> I agree – we are trying to randomly select one of two live entries with the 
> last condition.
> Something like this maybe makes it more clear what we are trying to do ?

Your code solves the issue of replacing alive entries while dead ones exist,
but you're still using hashes as random values, which is not right. Hashes
are not random, and there is no real difference between choosing the first
entry and choosing the entry with a bit set in a particular place. There
will always be some bad case where you replace the same entries all the
time and EMC performance will be low.
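
To illustrate (a toy model with made-up hashes, not the real EMC
structures): with a "smaller hash loses" victim rule, the two smallest-hash
flows sharing a bucket deterministically evict each other on every miss,
while the largest-hash entry is never replaced.

---
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define WAYS 2

int
main(void)
{
    uint32_t slot[WAYS] = {0, 0};                /* 0 means empty. */
    uint32_t flows[] = {0x1111, 0x2222, 0x9999}; /* Same bucket.   */

    for (int round = 0; round < 3; round++) {
        for (int f = 0; f < 3; f++) {
            uint32_t hash = flows[f];
            bool hit = false;
            int victim = -1;

            for (int w = 0; w < WAYS; w++) {
                if (slot[w] == hash) {
                    hit = true;
                    break;
                }
                if (victim < 0 || !slot[w]
                    || (slot[victim] && slot[w] < slot[victim])) {
                    victim = w;      /* Empty first, else min hash. */
                }
            }
            if (!hit) {
                printf("flow %#x evicts %#x\n", hash, slot[victim]);
                slot[victim] = hash;
            }
        }
    }
    return 0;
}
---

After the first round, 0x9999 always hits, while 0x1111 and 0x2222 keep
evicting each other in the remaining slot on every single lookup.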

> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 47a9fa0..75cc039 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -2051,12 +2051,15 @@ emc_insert(struct emc_cache *cache, const struct 
> netdev_flow_key *key,
>  }
>  
>  /* Replacement policy: put the flow in an empty (not alive) entry, or
> - * in the first entry where it can be */
> -if (!to_be_replaced
> -|| (emc_entry_alive(to_be_replaced)
> -&& !emc_entry_alive(current_entry))
> -|| current_entry->key.hash < to_be_replaced->key.hash) {
> + * randomly choose one of the two alive entries to be replaced. */
> +if (!to_be_replaced) {
>  to_be_replaced = current_entry;
> +} else if (emc_entry_alive(to_be_replaced) && 
> !emc_entry_alive(current_entry)) {
> +to_be_replaced = current_entry;
> +} else if (emc_entry_alive(to_be_replaced) && 
> emc_entry_alive(current_entry)) {
> +if (current_entry->key.hash & 0x1) {
> +to_be_replaced = current_entry;
> +    }
>  }
>  }
>  /* We didn't find the miniflow in the cache.
> 
> //
> 
> Thanks
> Yipeng
> 
> > -Original Message-
> > From: ovs-dev-boun...@openvswitch.org [mailto:ovs-dev-
> > boun...@openvswitch.org] On Behalf Of Ilya Maximets
> > Sent: Friday, July 28, 2017 5:41 AM
> > To: ovs-dev@openvswitch.org
> > Cc: Ilya Maximets <i.maxim...@samsung.com>; Heetae Ahn
> > <heetae82@samsung.com>
> > Subject: [ovs-dev] [PATCH] dpif-netdev: Simplify emc replacement policy.
> > 
> > The current EMC replacement policy allows replacing an active EMC entry
> > even if there are dead (empty) entries available. This leads to
> > EMC thrashing even with a few hundred flows. In some cases PMD
> > threads start to execute classifier lookups even in tests with
> > 50 - 100 active flows.
> > 
> > Fix this by removing the strange hash comparison rule from the
> > replacement check. The new behavior also matches the comment that
> > describes the replacement policy. That comment wasn't correct before.
> > 
> > Testing shows stable work of exact match cache without misses
> > with up to 3072 active flows and only 0.05% of EMC misses

[ovs-dev] [PATCH v2 1/2] dpif-netdev: Decrease range of values for EMC probability.

2017-07-31 Thread Ilya Maximets
Currently, the real insertion probability is higher than configured
for the maximum case because of wrong usage of the random value.

i.e. if 'emc-insert-inv-prob' == UINT32_MAX, then 'emc_insert_min'
equals 1. In this case we're allowing an insert if the random value
is less than or equal to 1. So, two of the 2**32 values (0 and 1) are
allowed and the real probability is 2 times higher than configured.

This happens because 'random_uint32()' returns a value in the range
[0; UINT32_MAX], but for the check to be correct we should
generate a random value in the range [0; UINT32_MAX - 1].

To fix this we have 4 possible solutions:

 1. Use uint64_t for 'emc-insert-min' and calculate it as
    '(UINT32_MAX + 1) / inverse_prob' to fairly check the full
    range [0; UINT32_MAX].

    This may decrease performance because of 64 bit atomic ops.

 2. Forbid '2**32 - 1' as a value for 'emc-insert-min'
    because it's the only value we have issues with.

    This would require additional explanations and is not very
    friendly for users.

 3. Generate the random value in the range [0; UINT32_MAX - 1].

    This would require a heavy division operation.

 4. Decrease the range of available values for 'emc-insert-inv-prob'.

    Actually, we don't need so many different values for that option.
    I believe that values higher than 1M are completely useless.
    By choosing the upper limit as a power of 2 like 2**20, we can
    mask the generated random value in a fast way and also avoid the
    range issue, because the same uint32_t can be used to store 2**20.

This patch implements solution #4.

CC: Ciara Loftus <ciara.lof...@intel.com>
Fixes: 4c30b24602c3 ("dpif-netdev: Conditional EMC insert")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---

The infrastructure and logic introduced here will be used for fixing
the EMC replacement policy.
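
A small self-contained check of the masking arithmetic (illustrative; it
mirrors the EM_FLOW_INSERT_INV_PROB_* macros added below): for
min = 2**20 / P, the masked comparison accepts exactly 'min' of the 2**20
possible values, i.e. a probability of exactly min / 2**20, with no
off-by-one.

---
#include <stdint.h>
#include <stdio.h>

#define SHIFT 20
#define MAX   (1u << SHIFT)           /* EM_FLOW_INSERT_INV_PROB_MAX  */
#define MASK  (MAX - 1)               /* EM_FLOW_INSERT_INV_PROB_MASK */

int
main(void)
{
    uint32_t inverse_prob = 100;      /* Example: 1/100 insertions.  */
    uint32_t min = MAX / inverse_prob;
    uint32_t accepted = 0;

    /* Walk the whole masked range once and count accepted values.  */
    for (uint32_t r = 0; r < MAX; r++) {
        if ((r & MASK) < min) {
            accepted++;
        }
    }
    printf("accepted %u of %u values (min = %u)\n", accepted, MAX, min);
    return 0;
}
---

The masking replaces the division of the old UINT32_MAX-based scheme with
a single AND on the hot path.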

 lib/dpif-netdev.c| 12 
 vswitchd/vswitch.xml |  3 ++-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 47a9fa0..123a7c9 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -152,9 +152,12 @@ struct netdev_flow_key {
 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
 #define EM_FLOW_HASH_SEGS 2
 
+#define EM_FLOW_INSERT_INV_PROB_SHIFT 20
+#define EM_FLOW_INSERT_INV_PROB_MAX  (1 << EM_FLOW_INSERT_INV_PROB_SHIFT)
+#define EM_FLOW_INSERT_INV_PROB_MASK (EM_FLOW_INSERT_INV_PROB_MAX - 1)
 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
-#define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
+#define DEFAULT_EM_FLOW_INSERT_MIN (EM_FLOW_INSERT_INV_PROB_MAX /\
 DEFAULT_EM_FLOW_INSERT_INV_PROB)
 
 struct emc_entry {
@@ -2077,7 +2080,7 @@ emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
 uint32_t min;
 atomic_read_relaxed(&pmd->dp->emc_insert_min, &min);
 
-if (min && random_uint32() <= min) {
+if (min && (random_uint32() & EM_FLOW_INSERT_INV_PROB_MASK) < min) {
 emc_insert(>flow_cache, key, flow);
 }
 }
@@ -2894,8 +2897,9 @@ dpif_netdev_set_config(struct dpif *dpif, const struct 
smap *other_config)
 }
 
 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
-if (insert_prob <= UINT32_MAX) {
-insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
+if (insert_prob < EM_FLOW_INSERT_INV_PROB_MAX) {
+insert_min = insert_prob == 0
+ ? 0 : EM_FLOW_INSERT_INV_PROB_MAX / insert_prob;
 } else {
 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 074535b..61f252e 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -381,7 +381,8 @@
       </column>
 
       <column name="other_config" key="emc-insert-inv-prob"
-              type='{"type": "integer", "minInteger": 0, "maxInteger": 4294967295}'>
+              type='{"type": "integer",
+                     "minInteger": 0, "maxInteger": 1048575}'>
         <p>
           Specifies the inverse probability (1/emc-insert-inv-prob) of a flow
           being inserted into the Exact Match Cache (EMC). On average one in
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 0/4] Output packet batching.

2017-08-02 Thread Ilya Maximets
Hi Bhanuprakash,
Thanks for testing. Comments inline.

Best regards, Ilya Maximets.

On 01.08.2017 18:33, Bodireddy, Bhanuprakash wrote:
>> This patch-set inspired by [1] from Bhanuprakash Bodireddy.
>> Implementation of [1] looks very complex and introduces many pitfalls for
>> later code modifications, like packets possibly getting stuck.
>>
>> This version targeted to make simple and flexible output packet batching on
>> higher level without introducing and even simplifying netdev layer.
>>
>> Patch set consists of 3 patches. All the functionality introduced in the 
>> first
>> patch. Two others are just cleanups of netdevs to not do unnecessary things.
>>
>> Basic testing of 'PVP with OVS bonding on phy ports' scenario shows
>> significant performance improvement.
>> More accurate and intensive testing required.
>>
>> [1] [PATCH 0/6] netdev-dpdk: Use intermediate queue during packet
>> transmission.
>>https://mail.openvswitch.org/pipermail/ovs-dev/2017-June/334762.html
>>
>> Version 2:
>>
>>  * Rebased on current master.
>>  * Added time based batching RFC patch.
>>  * Fixed mixing packets with different sources in same batch.
>>
> 
> Applied this series along with the other patches [1] and gave it an initial
> try. With this series, approximately a half-million drop in throughput is
> observed in a simple test case (P2P - 1 stream - udp) vs master + [1].

Could you please describe the packet sizes and the overall throughput of this
scenario? Otherwise it's completely unclear what the relative throughput
difference is.

Also, was the RFC patch applied too? I suggest performance testing without
that patch because it was only functionally tested (that is why it was sent
as an RFC). And your netdev-layer solution doesn't have such functionality
right now, so it would be a little unfair if we try to compare them later.

Besides that, there are a few possible performance optimizations that can be
applied to the current implementation of output batching. One of them is not
to check boundaries each time we try to add a packet to the batch, which now
happens in each call of 'dp_packet_batch_add()'. We can add packets without
the check because correctness is assured by a separate size check. Like this:

---
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index c5fe32e..f624db7 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -655,11 +655,18 @@ dp_packet_batch_init(struct dp_packet_batch *batch)
 }
 
 static inline void
+dp_packet_batch_add_unsafe(struct dp_packet_batch *batch,
+   struct dp_packet *packet)
+{
+batch->packets[batch->count++] = packet;
+}
+
+static inline void
 dp_packet_batch_add__(struct dp_packet_batch *batch,
   struct dp_packet *packet, size_t limit)
 {
 if (batch->count < limit) {
-batch->packets[batch->count++] = packet;
+dp_packet_batch_add_unsafe(batch, packet);
 } else {
 dp_packet_delete(packet);
 }
@@ -734,7 +741,7 @@ dp_packet_batch_clone(struct dp_packet_batch *dst,
 
 dp_packet_batch_init(dst);
 DP_PACKET_BATCH_FOR_EACH (packet, src) {
-dp_packet_batch_add(dst, dp_packet_clone(packet));
+dp_packet_batch_add_unsafe(dst, dp_packet_clone(packet));
 }
 dst->trunc = src->trunc;
 }
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e5f8a3d..f0c09d6 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5166,7 +5166,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
*packets_,
 }
 
 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
-dp_packet_batch_add(&p->output_pkts, packet);
+dp_packet_batch_add_unsafe(&p->output_pkts, packet);
 }
 return;
 }
---

I will post the dp_packet-related part of this as a separate patch to clean
up the current direct uses of the internal batch structure.

> The performance improvement is observed with multiple flows  (which this 
> series is meant to address).
> 
> At this stage no latency settings were used. Yet to review and do more 
> testing.
> 
> [1] Improves performance.
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/335359.html
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/336186.html
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/336187.html
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/336290.html
> 
> - Bhanuprakash.
> 
> 
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH RFC v2 4/4] dpif-netdev: Time based output batching.

2017-08-02 Thread Ilya Maximets
On 01.08.2017 18:45, Bodireddy, Bhanuprakash wrote:
>>On 28.07.2017 10:20, Darrell Ball wrote:
>>> I have not tested yet
>>>
>>> However, I would have expected something like a max latency config to be
>> specific to netdev-dpdk port types
>>
>>IMHO, if we can make it generic, we must make it generic.
>>
>> [Darrell]
>> The first question I ask myself is -  is this functionality intrinsically 
>> generic or is
>> it not ?
>> It is clearly not

That's your opinion. Don't present it as a well-known axiom.

>> and trying to make it artificially so would do the following:
>>
>> 1) We end up designing something the wrong way where it partially works.
>> 2) Breaks other features present and future that really do intersect.

The same applies to netdev-restricted features. Additionally, we would have
to implement such functionality for each netdev separately if needed, and
future generic features that intersect would be blocked by one particular
netdev implementation. It's not clear which is worse.

In addition, such a latency configuration can't be implemented without
support from dpif. This leads to the question:
do we need code specific to only one netdev in the generic dpif? We already
have too much DPDK-specific code there.
Following your logic, there should be a separate DPDK datapath in OVS, as
there was in the earlier implementations from Intel.

>>
>> Making of this
>>functionality netdev-dpdk specific will brake ability to test it using
>>unit tests. As the change is complex and has a lot of pitfalls like
>>possible packet stucks and possible latency issues, this code should be
>>covered by unit tests to simplify the support and modifications.
>>(And it's already partly covered because it is generic. And I fixed many
>>minor issues while developing through unit test failures.)
>>
>> [Darrell]
>> Most of dpdk is not tested by our unit tests because it cannot be simulated
>> well at the moment.

That's not an argument for not testing new features.

> This is orthogonal to the basic question however.
> 
> Darrell is right: the unit tests we currently have don't test the DPDK
> datapath well.
> So having these changes in the netdev layer shouldn't impact the unit tests much.

Yes, and that is the issue. IMHO, if we can make it generic and test it like
all other generic code, we should do it. There are no dependencies on the
DPDK library, which actually means this functionality can be tested in the
current environment. But making it netdev-specific will block this ability.
 
> While I share your other concern that changes in the netdev layer will be a
> little complex and slightly
> painful for future code changes, this max latency config introduced in the
> dpif layer may not hold good for
> different port types, and users may potentially introduce conflicting changes
> in the netdev layer in the future to
> suit their use cases.
>  
>>
>>
>>In the future this can be used also to improve performance of netdev-linux
>>by replacing sendmsg() with batched sendmmsg(). This should significantly
>>increase performance of flood actions while MACs are not learned yet in
>>action NORMAL.
>>
>>> This type of code also seems to intersect with present and future QoS
>> considerations in netdev-dpdk
> 
>>
>>Maybe, but there are also some related features in mail-list like rx queue
>>prioritization which are implemented in generic way on dpif-netdev layer.
> 
> If you are referring to rxq prioritization work by Billy 
> (https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/336001.html),
> this feature is more implemented in netdev layer with very minimal updates to 
> dpif layer. 
> 
> BTW,  dp_execute_cb()  is getting cluttered with this patch. 
> 
> - Bhanuprakash.
> 
>>
>>>
>>> -Original Message-
>>> From: Ilya Maximets <i.maxim...@samsung.com>
>>> Date: Wednesday, July 26, 2017 at 8:21 AM
>>> To: "ovs-dev@openvswitch.org" <ovs-dev@openvswitch.org>,
>> Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com>
>>> Cc: Heetae Ahn <heetae82@samsung.com>, Ben Pfaff
>> <b...@ovn.org>, Antonio Fischetti <antonio.fische...@intel.com>, Eelco
>> Chaudron <echau...@redhat.com>, Ciara Loftus <ciara.lof...@intel.com>,
>> Kevin Traynor <ktray...@redhat.com>, Darrell Ball <db...@vmware.com>,
>> Ilya Maximets <i.maxim...@samsung.com>
>>> Subject: [PATCH RFC v2 4/4] dpif-netdev: Time based output batching.
>>>
>>> This allows to colle

Re: [ovs-dev] DPDK Merge Repo

2017-08-02 Thread Ilya Maximets
Hi Darrell and Ben.

> Hi All
> 
> As mentioned before, I am using a repo for DPDK patch merging.
> The repo is here:
> https://github.com/darball/ovs/
> 
> There are still some outstanding patches from Bhanu that have not completed 
> review yet:
> 
> util: Add PADDED_MEMBERS_CACHELINE_MARKER macro to mark cachelines.- Bhanu
> packets: Reorganize the pkt_metadata structure. - Bhanu
> 
> and a series we would like to get into 2.8
> 
> netdev-dpdk: Use intermediate queue during packet transmission.  Bhanu Jun 
> 29/V3
> netdev: Add netdev_txq_flush function.
> netdev-dpdk: Add netdev_dpdk_txq_flush function.
> netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.
> netdev-dpdk: Add intermediate queue support.
> netdev-dpdk: Enable intermediate queue for vHost User port.
> dpif-netdev: Flush the packets in intermediate queue.

I think we still have not reached agreement about the level of implementation
(netdev-dpdk or dpif-netdev). Only a few people participate in the discussion,
which is not very productive. I suggest not targeting output batching for the
2.8 release because of this and also the lack of testing and review.
As I understand it, we have only a 3-day merge window for new features,
and I expect that we can't finish the discussion, review and testing in time.

> Please let me know if something else is approved but missed ?
> Anything else ?
> 
> Thanks Darrell


In addition I have a few general thoughts about merging via pull requests:

1. There is a requirement described in the contribution guide that the
   submitter must sign off the patch, but merges on GitHub don't work this
   way. So either the patches should be cherry-picked with footer
   modifications by the submitter (see the example after this list), or the
   contribution guide should be fixed to reflect the pull request workflow.
   I understand that authorship of the merge commit can replace the sign-off
   somehow, but it's sometimes not easy to find the corresponding merge
   commit for a particular change, and it still doesn't mean that the
   submitter agrees with the Developer's Certificate of Origin.

2. I'm a fan of plain git history. Could we use a 'Rebase and merge' policy
   without merge commits?
   https://github.com/blog/2243-rebase-and-merge-pull-requests
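
Regarding point 1, a footer-preserving cherry-pick can be as simple as
(illustrative):

  git cherry-pick -x -s <commit>

where -x records the original commit id and -s appends the committer's
Signed-off-by line.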

PS: Ben, I'm sorry for the typo in your name in my previous letter.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v6 0/2] conntrack : Add support for rx checksum offload.

2017-08-02 Thread Ilya Maximets
Hi Sugesh,

It's not a review, just a general patch style comment.
Please limit lines in the patch description to 75 characters in width,
as described in the contribution guide. Long lines look bad.

Darrell, Ban, I saw that these patches were applied to the darball/ovs GitHub
repo, and I'm asking you not to merge them until the descriptions are fixed,
so that we have a consistent and pretty log.

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH RFC v3 4/4] dpif-netdev: Time based output batching.

2017-08-10 Thread Ilya Maximets
This allows collecting packets from more than one RX burst
and sending them together, with a configurable maximum latency.

'other_config:output-max-latency' can be used to configure the
time that a packet can wait in the output batch before sending.
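
For example, to let packets sit in an output batch for up to 50 ms (assuming
the option lives in the Open_vSwitch table's other_config, like
'emc-insert-inv-prob'):

  ovs-vsctl set Open_vSwitch . other_config:output-max-latency=50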

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
Notes:

* This is an RFC and should not be used for performance testing.
* Millisecond granularity is used for now. Can be easily switched
  to use microseconds instead.

 lib/dpif-netdev.c| 121 ++-
 vswitchd/vswitch.xml |  15 +++
 2 files changed, 115 insertions(+), 21 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index dcf55f3..0d78ae4 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -85,6 +85,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 #define MAX_RECIRC_DEPTH 5
 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
 
+/* Use instant packet send by default. */
+#define DEFAULT_OUTPUT_MAX_LATENCY 0
+
 /* Configuration parameters. */
 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
 enum { MAX_METERS = 65536 };/* Maximum number of meters. */
@@ -262,6 +265,9 @@ struct dp_netdev {
 struct hmap ports;
 struct seq *port_seq;   /* Incremented whenever a port changes. */
 
+/* The time that a packet can wait in output batch for sending. */
+atomic_uint32_t output_max_latency;
+
 /* Meters. */
 struct ovs_mutex meter_locks[N_METER_LOCKS];
 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
@@ -502,6 +508,7 @@ struct tx_port {
 int qid;
 long long last_used;
 struct hmap_node node;
+long long output_time;
 struct dp_packet_batch output_pkts;
 };
 
@@ -574,6 +581,9 @@ struct dp_netdev_pmd_thread {
  * than 'cmap_count(dp->poll_threads)'. */
 uint32_t static_tx_qid;
 
+/* Number of filled output batches. */
+int n_output_batches;
+
 struct ovs_mutex port_mutex;/* Mutex for 'poll_list' and 'tx_ports'. */
 /* List of rx queues to poll. */
 struct hmap poll_list OVS_GUARDED;
@@ -669,9 +679,9 @@ static void dp_netdev_add_rxq_to_pmd(struct 
dp_netdev_pmd_thread *pmd,
 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
struct rxq_poll *poll)
 OVS_REQUIRES(pmd->port_mutex);
-static void
+static int
 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
-   long long now);
+   long long now, bool force);
 static void reconfigure_datapath(struct dp_netdev *dp)
 OVS_REQUIRES(dp->port_mutex);
 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
@@ -1193,6 +1203,7 @@ create_dp_netdev(const char *name, const struct 
dpif_class *class,
 conntrack_init(&dp->conntrack);
 
 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
+atomic_init(&dp->output_max_latency, DEFAULT_OUTPUT_MAX_LATENCY);
 
 cmap_init(&dp->poll_threads);
 
@@ -2858,7 +2869,7 @@ dpif_netdev_execute(struct dpif *dpif, struct 
dpif_execute *execute)
 dp_packet_batch_init_packet(&pp, execute->packet);
 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
   execute->actions, execute->actions_len, now);
-dp_netdev_pmd_flush_output_packets(pmd, now);
+dp_netdev_pmd_flush_output_packets(pmd, now, true);
 
 if (pmd->core_id == NON_PMD_CORE_ID) {
 ovs_mutex_unlock(&dp->non_pmd_mutex);
@@ -2907,6 +2918,16 @@ dpif_netdev_set_config(struct dpif *dpif, const struct 
smap *other_config)
 smap_get_ullong(other_config, "emc-insert-inv-prob",
 DEFAULT_EM_FLOW_INSERT_INV_PROB);
 uint32_t insert_min, cur_min;
+uint32_t output_max_latency, cur_max_latency;
+
+output_max_latency = smap_get_int(other_config, "output-max-latency",
+  DEFAULT_OUTPUT_MAX_LATENCY);
+atomic_read_relaxed(&dp->output_max_latency, &cur_max_latency);
+if (output_max_latency != cur_max_latency) {
+atomic_store_relaxed(&dp->output_max_latency, output_max_latency);
+VLOG_INFO("Output maximum latency set to %"PRIu32" ms",
+  output_max_latency);
+}
 
 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
 free(dp->pmd_cmask);
@@ -3107,11 +3128,12 @@ cycles_count_intermediate(struct dp_netdev_pmd_thread 
*pmd,
 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
 }
 
-static void
+static int
 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
struct tx_port *p, long long now)
 {
 int tx_qid;
+int output_cnt;
 bool dynamic_txqs;
 
 dynamic_txqs = p->port->dynamic_txqs;
@@ -3121,21 +3143,39 @@ dp_netdev_pmd_flush_output_on_port(struct 
dp_netdev_pmd_thread *pmd,
 tx_qid = pmd->s

[ovs-dev] [PATCH v3 3/4] netdev-dpdk: Remove useless cutlen.

2017-08-10 Thread Ilya Maximets
Cutlen is already applied while processing OVS_ACTION_ATTR_OUTPUT.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/netdev-dpdk.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 8e3158f..ddcc574 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1819,8 +1819,6 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct 
dp_packet_batch *batch)
 int newcnt = 0;
 int i;
 
-dp_packet_batch_apply_cutlen(batch);
-
 for (i = 0; i < batch->count; i++) {
 int size = dp_packet_size(batch->packets[i]);
 
@@ -1879,7 +1877,6 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
 dpdk_do_tx_copy(netdev, qid, batch);
 dp_packet_delete_batch(batch, true);
 } else {
-dp_packet_batch_apply_cutlen(batch);
 __netdev_dpdk_vhost_send(netdev, qid, batch->packets, batch->count);
 }
 return 0;
@@ -1910,8 +1907,6 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
 int cnt = batch->count;
 struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
 
-dp_packet_batch_apply_cutlen(batch);
-
 cnt = netdev_dpdk_filter_packet_len(dev, pkts, cnt);
 cnt = netdev_dpdk_qos_run(dev, pkts, cnt);
 dropped = batch->count - cnt;
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 0/4] Output packet batching.

2017-08-10 Thread Ilya Maximets
This patch set was inspired by [1] from Bhanuprakash Bodireddy.
The implementation of [1] looks very complex and introduces many pitfalls [2]
for later code modifications, like packets possibly getting stuck.

This version is targeted at simple and flexible output packet batching at a
higher level, without complicating the netdev layer (and even simplifying it).

The patch set consists of 3 patches. All the functionality is introduced in
the first patch. The other two are just cleanups of netdevs so they do not
do unnecessary things.

The 4th patch is just an RFC with a possible time based implementation.
It should not be considered for performance testing.

Basic testing of the 'PVP with OVS bonding on phy ports' scenario shows a
significant performance improvement.
More accurate and intensive testing is required.

[1] [PATCH v4 0/5] netdev-dpdk: Use intermediate queue during packet 
transmission.
https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/337019.html

[2] For example:
https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/337133.html

Version 3:

* Rebased on current master.
* Time based RFC: fixed assert on n_output_batches <= 0.

Version 2:

* Rebased on current master.
* Added time based batching RFC patch.
* Fixed mixing packets with different sources in same batch.

Ilya Maximets (4):
  dpif-netdev: Output packet batching.
  netdev: Remove unused may_steal.
  netdev-dpdk: Remove useless cutlen.
  dpif-netdev: Time based output batching.

 lib/dpif-netdev.c | 197 ++
 lib/netdev-bsd.c  |   4 +-
 lib/netdev-dpdk.c |  30 +++-
 lib/netdev-dummy.c|   4 +-
 lib/netdev-linux.c|   4 +-
 lib/netdev-provider.h |   7 +-
 lib/netdev.c  |  12 +--
 lib/netdev.h  |   2 +-
 vswitchd/vswitch.xml  |  15 
 9 files changed, 208 insertions(+), 67 deletions(-)

-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 1/4] dpif-netdev: Output packet batching.

2017-08-10 Thread Ilya Maximets
While processing an incoming batch of packets, they are scattered
across many per-flow batches and sent separately.

This becomes an issue when using more than a few flows.

For example, if we have balanced-tcp OvS bonding with 2 ports,
there will be 256 datapath internal flows for each dp_hash
pattern. This leads to scattering of a single received
batch across all of those 256 per-flow batches and invoking
send for each packet separately. This behaviour greatly degrades
the overall performance of netdev_send because of the inability to
use the advantages of vectorized transmit functions.
But half (if 2 ports are in bonding) of the datapath flows will
have the same output actions. This means that we can collect
those packets back in a single place and send them at once using
a single call to netdev_send. This patch introduces a per-port
packet batch for output packets for that purpose.

The 'output_pkts' batch is thread local and located in the send port cache.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c | 104 ++
 1 file changed, 82 insertions(+), 22 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e2cd931..a2a25be 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -502,6 +502,7 @@ struct tx_port {
 int qid;
 long long last_used;
 struct hmap_node node;
+struct dp_packet_batch output_pkts;
 };
 
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
@@ -633,9 +634,10 @@ static void dp_netdev_execute_actions(struct 
dp_netdev_pmd_thread *pmd,
   size_t actions_len,
   long long now);
 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
-struct dp_packet_batch *, odp_port_t port_no);
+struct dp_packet_batch *, odp_port_t port_no,
+long long now);
 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
-  struct dp_packet_batch *);
+  struct dp_packet_batch *, long long now);
 
 static void dp_netdev_disable_upcall(struct dp_netdev *);
 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
@@ -667,6 +669,9 @@ static void dp_netdev_add_rxq_to_pmd(struct 
dp_netdev_pmd_thread *pmd,
 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
struct rxq_poll *poll)
 OVS_REQUIRES(pmd->port_mutex);
+static void
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+   long long now);
 static void reconfigure_datapath(struct dp_netdev *dp)
 OVS_REQUIRES(dp->port_mutex);
 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
@@ -2809,6 +2814,7 @@ dpif_netdev_execute(struct dpif *dpif, struct 
dpif_execute *execute)
 struct dp_netdev *dp = get_dp_netdev(dpif);
 struct dp_netdev_pmd_thread *pmd;
 struct dp_packet_batch pp;
+long long now = time_msec();
 
 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
 dp_packet_size(execute->packet) > UINT16_MAX) {
@@ -2851,8 +2857,8 @@ dpif_netdev_execute(struct dpif *dpif, struct 
dpif_execute *execute)
 
 dp_packet_batch_init_packet(&pp, execute->packet);
 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
-  execute->actions, execute->actions_len,
-  time_msec());
+  execute->actions, execute->actions_len, now);
+dp_netdev_pmd_flush_output_packets(pmd, now);
 
 if (pmd->core_id == NON_PMD_CORE_ID) {
 ovs_mutex_unlock(&dp->non_pmd_mutex);
@@ -3101,6 +3107,37 @@ cycles_count_intermediate(struct dp_netdev_pmd_thread 
*pmd,
 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
 }
 
+static void
+dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
+   struct tx_port *p, long long now)
+{
+int tx_qid;
+bool dynamic_txqs;
+
+dynamic_txqs = p->port->dynamic_txqs;
+if (dynamic_txqs) {
+tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
+} else {
+tx_qid = pmd->static_tx_qid;
+}
+
+netdev_send(p->port->netdev, tx_qid, &p->output_pkts, true, dynamic_txqs);
+dp_packet_batch_init(&p->output_pkts);
+}
+
+static void
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+   long long now)
+{
+struct tx_port *p;
+
+HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
+if (!dp_packet_batch_is_empty(&p->output_pkts)) {
+dp_netdev_pmd_flush_output_on_port(pmd, p, now);
+}
+}
+}
+
 static int
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
struct netdev_rxq *rx,
@@ -3113,10 +3150,13 @@ dp_netdev

Re: [ovs-dev] [PATCH 1/2] dpif-netdev: Keep latest measured time for PMD thread.

2017-08-11 Thread Ilya Maximets
On 11.08.2017 11:13, Jan Scheurich wrote:
> Hi Ilya,
> 
> I fully agree with storing 'now' as part of the pmd struct instead of passing 
> it around as function arguments.
> 
> For me, struct dp_netdev_pmd_thread is *the* PMD thread context in
> dpif-netdev. I don't really see the benefit of creating a sub-struct
> dp_netdev_pmd_thread_ctx and moving certain data into it. Can you explain
> your motivation?

Hello Jan,

IMHO all the other fields in struct dp_netdev_pmd_thread have a direct
relation to the thread itself (like core_id, need_reload) or are its direct
accessories (like mutexes, stats, caches, lists of polled ports). On the
other hand, time and last_cycles have no actual relation to the thread and
can be moved out of struct dp_netdev_pmd_thread without any logical issues.
These fields are characteristics of the outside environment. For example,
'emc_insert_min', which I'm introducing in the next patch, is actually a
characteristic of the last received packet (maybe of the port), but not of
the thread itself.

> 
> BR, Jan
> 
>> -Original Message-
>> From: ovs-dev-boun...@openvswitch.org 
>> [mailto:ovs-dev-boun...@openvswitch.org] On Behalf Of Ilya Maximets
>> Sent: Thursday, 10 August, 2017 18:55
>> To: ovs-dev@openvswitch.org
>> Cc: Ilya Maximets <i.maxim...@samsung.com>; Heetae Ahn 
>> <heetae82@samsung.com>
>> Subject: [ovs-dev] [PATCH 1/2] dpif-netdev: Keep latest measured time for 
>> PMD thread.
>>
>> In the current implementation the 'now' variable is updated once on each
>> receive cycle and passed through the whole datapath via function
>> arguments. It's better to keep this variable inside the PMD
>> thread structure to be able to get it at any time. Such a solution
>> will save stack memory and simplify possible modifications
>> of the current logic.
>>
>> This patch introduces new structure 'dp_netdev_pmd_thread_ctx'
>> contained by 'struct dp_netdev_pmd_thread' to store any processing
>> context of this PMD thread. For now, only time and cycles moved to
>> that structure. Can be extended in the future.
>>
>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
>> ---
>>
>> Will be used in the next patch to not pass the probability
>> through the whole datapath.
>>
>>  lib/dpif-netdev.c | 117 
>> ++
>>  1 file changed, 65 insertions(+), 52 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index e2cd931..6d42393 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -504,6 +504,16 @@ struct tx_port {
>>  struct hmap_node node;
>>  };
>>
>> +/* Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
>> +struct dp_netdev_pmd_thread_ctx {
>> +/* Latest measured time. */
>> +long long now;
>> +/* Used to count cycles. See 'cycles_count_end()' */
>> +unsigned long long last_cycles;
>> +};
>> +
>> +static void pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *);
>> +
>>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
>>   * the performance overhead of interrupt processing.  Therefore netdev can
>>   * not implement rx-wait for these devices.  dpif-netdev needs to poll
>> @@ -556,8 +566,8 @@ struct dp_netdev_pmd_thread {
>>  /* Cycles counters */
>>  struct dp_netdev_pmd_cycles cycles;
>>
>> -/* Used to count cicles. See 'cycles_counter_end()' */
>> -unsigned long long last_cycles;
>> +/* Current context of the PMD thread. */
>> +struct dp_netdev_pmd_thread_ctx ctx;
>>
>>  struct latch exit_latch;/* For terminating the pmd thread. */
>>  struct seq *reload_seq;
>> @@ -630,8 +640,7 @@ static void dp_netdev_execute_actions(struct 
>> dp_netdev_pmd_thread *pmd,
>>struct dp_packet_batch *,
>>bool may_steal, const struct flow 
>> *flow,
>>const struct nlattr *actions,
>> -  size_t actions_len,
>> -  long long now);
>> +  size_t actions_len);
>>  static void dp_netdev_input(struct dp_netdev_pmd_thread *,
>>  struct dp_packet_batch *, odp_port_t port_no);
>>  static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
>> @@ -679,9 +688,9 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread 
>> *pmd);
>>
>>  static v

Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-11 Thread Ilya Maximets
On 10.08.2017 21:52, Bodireddy, Bhanuprakash wrote:
>>>
>  } else {
> +/* If the queue is disabled in the guest, the 
> corresponding qid
> + * map shall be set to OVS_VHOST_QUEUE_DISABLED(-2).
> + *
> + * The packets that were queued in 'qid' could be 
> potentially
> + * stuck and needs to be dropped.
> + *
> + * XXX: The queues may be already disabled in the guest 
> so
> + * flush function in this case only helps in updating 
> stats
> + * and freeing memory.
> + */
> +netdev_dpdk_vhost_txq_flush(&dev->up, qid, 0);
>  dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
>  }
>  netdev_dpdk_remap_txqs(dev);
>>>
>>> 'netdev_dpdk_remap_txqs()', actually, is able to change mapping for
>>> all the disabled in guest queues. So, we need to flush all of them
>>> while remapping somewhere inside the function.
>>> One other thing is that there is a race window between the flush and the
>>> mapping update where another thread is able to enqueue more packets into
>>> the just-flushed queue. The order of operations should be changed, or both
>>> of them should be done under the same tx_lock. I think it's required
>>> to make the tx_q[].map field atomic to fix the race condition, because
>>> the send function takes the 'map' and then locks the corresponding queue.
>>> It wasn't an issue before, because packets in case of race was just
>>> dropped on attempt to send to disabled queue, but with this patch
>>> applied they will be enqueued to the intermediate queue and stuck there.
>>
>> Making 'map' atomic will not help. To solve the race we should make
>> 'reading the map + enqueue' an atomic operation under some spinlock.
>> Like this:
>>
>> vhost_send:
>> 
>>qid = qid % netdev->n_txq;
>>rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>
>>mapped_qid = dev->tx_q[qid].map;
>>
>>if (qid != mapped_qid) {
>>    rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>>}
>>
>>tx_enqueue(mapped_qid, pkts, cnt);
>>
>>if (qid != mapped_qid) {
>>    rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>>}
>>
>>rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>> 
>>
>> txq remapping inside 'netdev_dpdk_remap_txqs()' or
>> 'vring_state_changed()':
>> 
>>qid - queue we need to remap.
>>new_qid - queue we need to remap to.
>>
>>rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>
>>mapped_qid = dev->tx_q[qid].map;
>>if (qid != mapped_qid) {
>>    rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>>}
>>
>>tx_flush(mapped_qid)
>>
>>if (qid != mapped_qid) {
>>    rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>>}
>>
>>dev->tx_q[qid].map = new_qid;
>>
>>rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>> 
>>
>> The above scheme should work without races, but it looks kind of ugly and
>> requires taking an additional spinlock on each send.
>>
>> P.S. Sorry for talking with myself. Just want to share my thoughts.
> 
> Hi Ilya,
> 
> Can you please review the below changes based on what you suggested above?
> As the problem only happens when the queues are enabled/disabled in the
> guest,
> I did some preliminary testing with the below changes by sending some
> traffic into the VM
> while enabling and disabling the queues inside the guest at the same time.
> 
> Vhost_send()
> -
> qid = qid % netdev->n_txq;
> 
> /* Acquire tx_lock before reading tx_q[qid].map and enqueueing packets.
>  * tx_q[].map gets updated in vring_state_changed() when vrings are
>  * enabled/disabled in the guest. */
> rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> 
> mapped_qid = dev->tx_q[qid].map;
> if (OVS_UNLIKELY(qid != mapped_qid)) {
> rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
> }
> 
> if (OVS_UNLIKELY(!is_vhost_running(dev) || mapped_qid < 0
>  || !(dev->flags & NETDEV_UP))) {
> rte_spinlock_lock(&dev->stats_lock);
> dev->stats.tx_dropped += cnt;
> rte_spinlock_unlock(&dev->stats_lock);
> 
> for (i = 0; i < total_pkts; i++) {
> dp_packet_delete(pkts[i]);
> }
> 
> if (OVS_UNLIKELY(qid != mapped_qid)) {
> rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
> }
> rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> 
> return;
> }
> 
> cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
> /* Check has QoS has been configured for the netdev */
> cnt = netdev_dpdk_qos_run(dev, 

Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-11 Thread Ilya Maximets
On 11.08.2017 16:11, Bodireddy, Bhanuprakash wrote:
>> On 09.08.2017 15:35, Bodireddy, Bhanuprakash wrote:
>
> +static int
> +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid) {
> +struct dpdk_tx_queue *txq = &dev->tx_q[qid];
> +struct rte_mbuf **cur_pkts = (struct rte_mbuf
> +**)txq->vhost_burst_pkts;
> +
> +int tx_vid = netdev_dpdk_get_vid(dev);
> +int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
> +uint32_t sent = 0;
> +uint32_t retries = 0;
> +uint32_t sum, total_pkts;
> +
> +total_pkts = sum = txq->vhost_pkt_cnt;
> +do {
> +uint32_t ret;
> +ret = rte_vhost_enqueue_burst(tx_vid, tx_qid,
> + &cur_pkts[sent],
 sum);
> +if (OVS_UNLIKELY(!ret)) {
> +/* No packets enqueued - do not retry. */
> +break;
> +} else {
> +/* Packet have been sent. */
> +sent += ret;
> +
> +/* 'sum' packet have to be retransmitted. */
> +sum -= ret;
> +}
> +} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
> +
> +for (int i = 0; i < total_pkts; i++) {
> +dp_packet_delete(txq->vhost_burst_pkts[i]);
> +}
> +
> +/* Reset pkt count. */
> +txq->vhost_pkt_cnt = 0;
> +
> +/* 'sum' refers to packets dropped. */
> +return sum;
> +}
> +
> +/* Flush the txq if there are any packets available. */ static int
> +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
> +bool concurrent_txq OVS_UNUSED) {
> +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +struct dpdk_tx_queue *txq;
> +
> +qid = dev->tx_q[qid % netdev->n_txq].map;
> +
> +/* The qid may be disabled in the guest and has been set to
> + * OVS_VHOST_QUEUE_DISABLED.
> + */
> +if (OVS_UNLIKELY(qid < 0)) {
> +return 0;
> +}
> +
> +txq = &dev->tx_q[qid];
> +/* Increment the drop count and free the memory. */
> +if (OVS_UNLIKELY(!is_vhost_running(dev) ||
> + !(dev->flags & NETDEV_UP))) {
> +
> +if (txq->vhost_pkt_cnt) {
> +rte_spinlock_lock(&dev->stats_lock);
> +dev->stats.tx_dropped += txq->vhost_pkt_cnt;
> +rte_spinlock_unlock(&dev->stats_lock);
> +
> +for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
> +dp_packet_delete(txq->vhost_burst_pkts[i]);

 Spinlock (tx_lock) must be held here to avoid queue and mempool
>> breakage.
>>>
>>> I think you are right. tx_lock might be acquired for freeing the packets.
>>
>> I think that 'vhost_pkt_cnt' reads and updates should also be protected to
>> avoid races.
> 
> From the discussion in the thread 
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/337133.html,
> We are going to acquire tx_lock for updating the map and flushing the queue 
> inside vring_state_changed(). 
> 
> That triggers a deadlock in the  flushing function as we have already 
> acquired the same lock in netdev_dpdk_vhost_txq_flush().
> This is the same problem for freeing the memory and protecting the updates to 
> vhost_pkt_cnt.
> 
>if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>    netdev_dpdk_vhost_tx_burst(dev, qid);
>    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>}
> 
> As the problem is triggered when the guest queues are enabled/disabled, with
> a small race window where packets can get enqueued into the queue just after
> the flush and before the map value is updated in the callback function
> (vring_state_changed()), how about this?
> 
> Technically, as the queues are disabled, there is no point in flushing the
> packets, so let's free the packets and reset txq->vhost_pkt_cnt in
> vring_state_changed() itself instead of calling flush().

Technically, the enabling case should also be handled, because while enabling
we're remapping the queue and, in some specific cases, I guess, the old
queue may no longer be used by the threads after remapping.

> 
> vring_state_changed().
> --
> rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> 
> mapped_qid = dev->tx_q[qid].map;
>  if (OVS_UNLIKELY(qid != mapped_qid)) {
> rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
> }
> 
> if (enable) {
> dev->tx_q[qid].map = qid;
>   } else {
> struct dpdk_tx_queue *txq = &dev->tx_q[qid];
> if (txq->vhost_pkt_cnt) {
> rte_spinlock_lock(&dev->stats_lock);
> dev->stats.tx_dropped += txq->vhost_pkt_cnt;
> rte_spinlock_unlock(&dev->stats_lock);
> 
> for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
> dp_packet_delete(txq->vhost_burst_pkts[i]);
>   

[ovs-dev] [PATCH] dpif-netdev: Fix per packet cycles statistics.

2017-08-11 Thread Ilya Maximets
DP_STAT_LOOKUP_HIT statistics were mistakenly used in the calculation
of the total number of packets. This leads to completely wrong
per-packet cycles statistics.

For example:

emc hits:0
megaflow hits:253702308
avg. subtable lookups per hit:1.50
miss:0
lost:0
avg cycles per packet: 248.32 (157498766585/634255770)

In this case the total_packets value 634255770 was used for the avg
per packet calculation:

  total_packets = 'megaflow hits' + 'megaflow hits' * 1.5
                = 253702308 + 380553462 = 634255770

because the DP_STAT_LOOKUP_HIT counter holds 'megaflow hits' * 1.5
(1.5 subtable lookups per hit). The real value should be
524.38 (157498766585/253702308).

Fix that by summing only the stats that reflect match/no match.
Direct summing of the required values was chosen instead of
skipping some stats in a loop, to make the calculation clearer and
to avoid similar issues in the future.

CC: Jan Scheurich <jan.scheur...@ericsson.com>
Fixes: 3453b4d62a98 ("dpif-netdev: dpcls per in_port with sorted subtables")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e2cd931..17e1666 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -755,7 +755,7 @@ pmd_info_show_stats(struct ds *reply,
 unsigned long long stats[DP_N_STATS],
 uint64_t cycles[PMD_N_CYCLES])
 {
-unsigned long long total_packets = 0;
+unsigned long long total_packets;
 uint64_t total_cycles = 0;
 int i;
 
@@ -771,13 +771,12 @@ pmd_info_show_stats(struct ds *reply,
 } else {
 stats[i] = 0;
 }
-
-if (i != DP_STAT_LOST) {
-/* Lost packets are already included in DP_STAT_MISS */
-total_packets += stats[i];
-}
 }
 
+/* Sum of all the matched and not matched packets gives the total.  */
+total_packets = stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT]
++ stats[DP_STAT_MISS];
+
 for (i = 0; i < PMD_N_CYCLES; i++) {
 if (cycles[i] > pmd->cycles_zero[i]) {
cycles[i] -= pmd->cycles_zero[i];
-- 
2.7.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 2/2] dpif-netdev: Per-port conditional EMC insert.

2017-08-10 Thread Ilya Maximets
Conditional EMC insert helps a lot in scenarios with high numbers
of parallel flows, but in the current implementation this option affects
all threads and ports at once. There are scenarios where we have
different numbers of flows on different ports. For example, if one
of the VMs encapsulates traffic using additional headers, it will
receive a large number of flows but only a few flows will come out of
this VM. In this scenario it's much faster to use the EMC instead of
the classifier for traffic from the VM, but it's better to disable EMC
insertion for the traffic which flows to the VM.

To handle the above issue, 'emc-insert-inv-prob' was converted to a per-port
option. The default value and behaviour are kept as is.

For example, following command sets the insertion probability for
packets that came from port 'dpdk0' to ~1/20, i.e. ~5%:

  ovs-vsctl set interface dpdk0 other_config:emc-insert-inv-prob=20

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 Documentation/howto/dpdk.rst |   4 +-
 NEWS |   2 +-
 lib/dpif-netdev.c| 106 +--
 tests/pmd.at |   7 ++-
 vswitchd/vswitch.xml |  42 ++---
 5 files changed, 106 insertions(+), 55 deletions(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index d7f6610..c620961 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -389,9 +389,9 @@ EMC Insertion Probability
 -
 By default 1 in every 100 flows are inserted into the Exact Match Cache (EMC).
 It is possible to change this insertion probability by setting the
-``emc-insert-inv-prob`` option::
+``emc-insert-inv-prob`` option for the desired interface::
 
-$ ovs-vsctl --no-wait set Open_vSwitch . other_config:emc-insert-inv-prob=N
+$ ovs-vsctl set interface <iface> other_config:emc-insert-inv-prob=N
 
 where:
 
diff --git a/NEWS b/NEWS
index 66eb936..a7bfdaf 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,6 @@
 Post-v2.8.0
 
-   - Nothing yet.
+   - EMC insertion probability turned to per-port other_config.
 
 v2.8.0 - xx xxx 
 -
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 6d42393..94e7bc4 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -266,9 +266,6 @@ struct dp_netdev {
 struct ovs_mutex meter_locks[N_METER_LOCKS];
 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
 
-/* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
-OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
-
 /* Protects access to ofproto-dpif-upcall interface during revalidator
  * thread synchronization. */
 struct fat_rwlock upcall_rwlock;
@@ -364,6 +361,8 @@ struct dp_netdev_port {
 unsigned n_rxq; /* Number of elements in 'rxqs' */
 unsigned *txq_used; /* Number of threads that use each tx queue. */
 struct ovs_mutex txq_used_mutex;
+uint32_t emc_insert_min;/* Probability of EMC insertions is a factor
+ * of 'emc_insert_min'. */
 char *type; /* Port type as requested by user. */
 char *rxq_affinity_list;/* Requested affinity of rx queues. */
 };
@@ -487,6 +486,7 @@ struct dp_netdev_pmd_cycles {
 struct polled_queue {
 struct netdev_rxq *rx;
 odp_port_t port_no;
+uint32_t emc_insert_min;
 };
 
 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
@@ -508,6 +508,8 @@ struct tx_port {
 struct dp_netdev_pmd_thread_ctx {
 /* Latest measured time. */
 long long now;
+/* EMC insertion probability context for the current processing cycle. */
+uint32_t emc_insert_min;
 /* Used to count cycles. See 'cycles_count_end()' */
 unsigned long long last_cycles;
 };
@@ -1202,8 +1204,6 @@ create_dp_netdev(const char *name, const struct 
dpif_class *class,
 
 conntrack_init(&dp->conntrack);
 
-atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
-
 cmap_init(&dp->poll_threads);
 
 ovs_mutex_init(&dp->tx_qid_pool_mutex);
@@ -1485,6 +1485,7 @@ port_create(const char *devname, const char *type,
 port->netdev = netdev;
 port->type = xstrdup(type);
 port->sf = sf;
+port->emc_insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
 port->need_reconfigure = true;
 ovs_mutex_init(&port->txq_used_mutex);
 
@@ -2104,8 +2105,7 @@ emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
  * default the value is UINT32_MAX / 100 which yields an insertion
  * probability of 1/100 ie. 1% */
 
-uint32_t min;
-atomic_read_relaxed(&pmd->dp->emc_insert_min, &min);
+uint32_t min = pmd->ctx.emc_insert_min;
 
 if (min && random_uint32() <= min) {
 emc_insert(&pmd->flow_cache, key, flow);
@@ -2914,10 +2914,6 @@ dpif_netdev_set_config(struct dpif *dpif, const struct 
smap *other_config)
 {
 struct dp_netdev *dp = get_dp_netdev(dpif);
 const char *cmask = smap

[ovs-dev] [PATCH 1/2] dpif-netdev: Keep latest measured time for PMD thread.

2017-08-10 Thread Ilya Maximets
In the current implementation, the 'now' variable is updated once on
each receive cycle and passed through the whole datapath via function
arguments. It's better to keep this variable inside the PMD thread
structure so that it can be obtained at any time. Such a solution
saves stack memory and simplifies possible modifications of the
current logic.

This patch introduces a new structure 'dp_netdev_pmd_thread_ctx'
contained by 'struct dp_netdev_pmd_thread' to store any processing
context of this PMD thread. For now, only the time and cycle counters
are moved to that structure; it can be extended in the future.
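
As an illustration of the intended usage (a sketch based on this patch,
not additional API):

    /* Once per receive/processing cycle, refresh the cached time: */
    pmd_thread_ctx_time_update(pmd);    /* pmd->ctx.now = time_msec(); */

    /* Later code on the same thread reads the context instead of
     * receiving 'now' as a function argument: */
    long long now = pmd->ctx.now;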

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---

Will be used in the next patch to avoid passing the probability
through the whole datapath.

 lib/dpif-netdev.c | 117 ++
 1 file changed, 65 insertions(+), 52 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e2cd931..6d42393 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -504,6 +504,16 @@ struct tx_port {
 struct hmap_node node;
 };
 
+/* Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
+struct dp_netdev_pmd_thread_ctx {
+/* Latest measured time. */
+long long now;
+/* Used to count cycles. See 'cycles_count_end()' */
+unsigned long long last_cycles;
+};
+
+static void pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *);
+
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
  * the performance overhead of interrupt processing.  Therefore netdev can
  * not implement rx-wait for these devices.  dpif-netdev needs to poll
@@ -556,8 +566,8 @@ struct dp_netdev_pmd_thread {
 /* Cycles counters */
 struct dp_netdev_pmd_cycles cycles;
 
-/* Used to count cicles. See 'cycles_counter_end()' */
-unsigned long long last_cycles;
+/* Current context of the PMD thread. */
+struct dp_netdev_pmd_thread_ctx ctx;
 
 struct latch exit_latch;/* For terminating the pmd thread. */
 struct seq *reload_seq;
@@ -630,8 +640,7 @@ static void dp_netdev_execute_actions(struct 
dp_netdev_pmd_thread *pmd,
   struct dp_packet_batch *,
   bool may_steal, const struct flow *flow,
   const struct nlattr *actions,
-  size_t actions_len,
-  long long now);
+  size_t actions_len);
 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
 struct dp_packet_batch *, odp_port_t port_no);
 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
@@ -679,9 +688,9 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread 
*pmd);
 
 static void
 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
-   long long now, bool purge);
+   bool purge);
 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
-  struct tx_port *tx, long long now);
+  struct tx_port *tx);
 
 static inline bool emc_entry_alive(struct emc_entry *ce);
 static void emc_clear_entry(struct emc_entry *ce);
@@ -723,6 +732,12 @@ emc_cache_slow_sweep(struct emc_cache *flow_cache)
 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
 }
 
+static inline void
+pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
+{
+pmd->ctx.now = time_msec();
+}
+
 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
 bool
 dpif_is_netdev(const struct dpif *dpif)
@@ -2839,6 +2854,9 @@ dpif_netdev_execute(struct dpif *dpif, struct 
dpif_execute *execute)
 ovs_mutex_lock(&dp->non_pmd_mutex);
 }
 
+/* Update current time in PMD context. */
+pmd_thread_ctx_time_update(pmd);
+
 /* The action processing expects the RSS hash to be valid, because
  * it's always initialized at the beginning of datapath processing.
  * In this case, though, 'execute->packet' may not have gone through
@@ -2851,8 +2869,7 @@ dpif_netdev_execute(struct dpif *dpif, struct 
dpif_execute *execute)
 
 dp_packet_batch_init_packet(&pp, execute->packet);
 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
-  execute->actions, execute->actions_len,
-  time_msec());
+  execute->actions, execute->actions_len);
 
 if (pmd->core_id == NON_PMD_CORE_ID) {
 ovs_mutex_unlock(&dp->non_pmd_mutex);
@@ -3073,7 +3090,7 @@ cycles_count_start(struct dp_netdev_pmd_thread *pmd)
 OVS_ACQUIRES(_counter_fake_mutex)
 OVS_NO_THREAD_SAFETY_ANALYSIS
 {
-pmd->last_cycles = cycles_counter();
+pmd->ctx.last_cycles = cycles_counter();
 }
 
 /* Stop counting cycles and add them to the counter 

[ovs-dev] [PATCH 0/2] Per-port EMC insertion probability.

2017-08-10 Thread Ilya Maximets
Ilya Maximets (2):
  dpif-netdev: Keep latest measured time for PMD thread.
  dpif-netdev: Per-port conditional EMC insert.

 Documentation/howto/dpdk.rst |   4 +-
 NEWS |   2 +-
 lib/dpif-netdev.c| 223 ++-
 tests/pmd.at |   7 +-
 vswitchd/vswitch.xml |  42 
 5 files changed, 171 insertions(+), 107 deletions(-)

-- 
2.7.4



Re: [ovs-dev] [PATCH] dp-packet: Introduce dp_packet_batch_add_unsafe().

2017-08-11 Thread Ilya Maximets
On 09.08.2017 17:19, Andy Zhou wrote:
>> Maybe I don't fully understand what you're trying to say, but I want to use
>> unsafe function in dpif-netdev for per-flow packet batching (see the patch)
>> and it should not be internal for that case.
>> (It's safe to use unsafe function there because per-flow batches are
>> guaranteed to be less than original rx batch.)
> 
> Let me try again.  As you have pointed out, it is not really unsafe to
> use the function at those calling sites, I think dp_packet_batch_add__()
> is a suitable name. In case you prefer a different function name, that's
> fine too, I just don't think unsafe is accurate.

Oh. What I'm trying to achieve:

1. Eliminate direct usages of the batch's internal structure.
2. Don't check the boundaries in
   lib/dpif-netdev.c:packet_batch_per_flow_update().

The second goal requires a function, exposed to end users, that does
not check the boundaries while adding a packet to a batch.
dp_packet_batch_add__() is not suitable in this case, because its "__"
suffix marks it as internal, so it should not be exposed to end users.

Could you suggest another dp_packet_batch_XXX() name (one that can be
exposed to the end user) for the function that doesn't check the
boundaries instead of 'add_unsafe', if you think that name is not
accurate?
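
For readers following the thread, a minimal sketch of the two variants
under discussion (the bodies, names, and the "checked" behavior here
are illustrative, not the final API):

    /* Checked variant: refuses to add when the batch is already full. */
    static inline void
    dp_packet_batch_add_checked(struct dp_packet_batch *batch,
                                struct dp_packet *packet)
    {
        if (batch->count < NETDEV_MAX_BURST) {
            batch->packets[batch->count++] = packet;
        }
    }

    /* Unchecked variant: the caller guarantees there is room, e.g.
     * because a per-flow batch can never exceed the original rx batch. */
    static inline void
    dp_packet_batch_add_unchecked(struct dp_packet_batch *batch,
                                  struct dp_packet *packet)
    {
        batch->packets[batch->count++] = packet;
    }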


Best regards, Ilya Maximets.


Re: [ovs-dev] [PATCH RFC v3 4/4] dpif-netdev: Time based output batching.

2017-08-14 Thread Ilya Maximets
On 14.08.2017 02:33, Jan Scheurich wrote:
>> This allows to collect packets from more than one RX burst
>> and send them together with a configurable maximum latency.
>>
>> 'other_config:output-max-latency' can be used to configure
>> time that a packet can wait in output batch for sending.
>>
>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
>> ---
>> Notes:
>>
>> * This is an RFC and should not be used for performance testing.
>> * Millisecond granularity is used for now. Can be easily switched
>>   to use microseconds instead.
> 
>> From earlier in-house trials we know we need to target flush times of 50 us 
>>or less, so we clearly need better time resolution. Sub-ms timing in PMD 
>>should be based on TSC cycles, which are already kept in the pmd struct. 
>>Could you provide a corresponding patch for performance testing?

I don't think that TSC is suitable in this case. Some reasons:

* non-PMD threads are able to float across cpu cores.
* Turbo-boost can be enabled or the frequency can be adjusted manually
  after DPDK init.
* TSC cycles are only counted if DPDK is enabled.

TSC is currently used only for not-really-precise statistics.
For real features we need more accurate time accounting.

I believe that CLOCK_MONOTONIC is able to provide at least microsecond
granularity on most systems. We just need to add one more wrapper
function like 'time_usec()' to lib/timeval.

I'll send corresponding WIP incremental patches in reply to this message.

> 
>>
>>  lib/dpif-netdev.c| 121
>> ++-
>>  vswitchd/vswitch.xml |  15 +++
>>  2 files changed, 115 insertions(+), 21 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index dcf55f3..0d78ae4 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -85,6 +85,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>>  #define MAX_RECIRC_DEPTH 5
>>  DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
>>
>> +/* Use instant packet send by default. */
>> +#define DEFAULT_OUTPUT_MAX_LATENCY 0
>> +
>>  /* Configuration parameters. */
>>  enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow
>> table. */
>>  enum { MAX_METERS = 65536 };/* Maximum number of meters. */
>> @@ -262,6 +265,9 @@ struct dp_netdev {
>>  struct hmap ports;
>>  struct seq *port_seq;   /* Incremented whenever a port changes. */
>>
>> +/* The time that a packet can wait in output batch for sending. */
>> +atomic_uint32_t output_max_latency;
>> +
>>  /* Meters. */
>>  struct ovs_mutex meter_locks[N_METER_LOCKS];
>>  struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
>> @@ -502,6 +508,7 @@ struct tx_port {
>>  int qid;
>>  long long last_used;
>>  struct hmap_node node;
>> +long long output_time;
> 
> Rename to flush_time?

Maybe. Why do you think it's better?

> 
>>  struct dp_packet_batch output_pkts;
>>  };
>>
>> @@ -574,6 +581,9 @@ struct dp_netdev_pmd_thread {
>>   * than 'cmap_count(dp->poll_threads)'. */
>>  uint32_t static_tx_qid;
>>
>> +/* Number of filled output batches. */
>> +int n_output_batches;
>> +
>>  struct ovs_mutex port_mutex;/* Mutex for 'poll_list' and 'tx_ports'.
>> */
>>  /* List of rx queues to poll. */
>>  struct hmap poll_list OVS_GUARDED;
>> @@ -669,9 +679,9 @@ static void dp_netdev_add_rxq_to_pmd(struct
>> dp_netdev_pmd_thread *pmd,
>>  static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread
>> *pmd,
>> struct rxq_poll *poll)
>>  OVS_REQUIRES(pmd->port_mutex);
>> -static void
>> +static int
>>  dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread
>> *pmd,
>> -   long long now);
>> +   long long now, bool force);
>>  static void reconfigure_datapath(struct dp_netdev *dp)
>>  OVS_REQUIRES(dp->port_mutex);
>>  static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
>> @@ -1193,6 +1203,7 @@ create_dp_netdev(const char *name, const
>> struct dpif_class *class,
>>  conntrack_init(>conntrack);
>>
>>  atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
>> +atomic_init(&dp->output_max_latency,
>> DEFAULT_OUTPUT_MAX_LATENCY);
>>
>>  cmap_init(>poll_threads);
>>
>> @@ -2858,7 +2869,7 @@ dpif_netdev_execute(struct dpif *dpif,

[ovs-dev] [PATCH RFC 2/2] dpif-netdev: Use microseconds granularity for output-max-latency.

2017-08-14 Thread Ilya Maximets
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c| 16 +---
 vswitchd/vswitch.xml |  5 +++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 0d78ae4..cf1591c 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2825,7 +2825,7 @@ dpif_netdev_execute(struct dpif *dpif, struct 
dpif_execute *execute)
 struct dp_netdev *dp = get_dp_netdev(dpif);
 struct dp_netdev_pmd_thread *pmd;
 struct dp_packet_batch pp;
-long long now = time_msec();
+long long now = time_usec();
 
 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
 dp_packet_size(execute->packet) > UINT16_MAX) {
@@ -2925,7 +2925,7 @@ dpif_netdev_set_config(struct dpif *dpif, const struct 
smap *other_config)
 atomic_read_relaxed(&dp->output_max_latency, &cur_max_latency);
 if (output_max_latency != cur_max_latency) {
 atomic_store_relaxed(&dp->output_max_latency, output_max_latency);
-VLOG_INFO("Output maximum latency set to %"PRIu32" ms",
+VLOG_INFO("Output maximum latency set to %"PRIu32" us",
   output_max_latency);
 }
 
@@ -3166,7 +3166,7 @@ dp_netdev_pmd_flush_output_packets(struct 
dp_netdev_pmd_thread *pmd,
 }
 
 if (!now) {
-now = time_msec();
+now = time_usec();
 }
 
 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
@@ -3190,7 +3190,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 dp_packet_batch_init(&batch);
 error = netdev_rxq_recv(rx, &batch);
 if (!error) {
-long long now = time_msec();
+long long now = time_usec();
 
 *recirc_depth_get() = 0;
 
@@ -3768,7 +3768,7 @@ dpif_netdev_run(struct dpif *dpif)
 }
 
 cycles_count_end(non_pmd, PMD_CYCLES_IDLE);
-dpif_netdev_xps_revalidate_pmd(non_pmd, time_msec(), false);
+dpif_netdev_xps_revalidate_pmd(non_pmd, time_usec(), false);
 ovs_mutex_unlock(&dp->non_pmd_mutex);
 
 dp_netdev_pmd_unref(non_pmd);
@@ -4742,7 +4742,7 @@ packet_batch_per_flow_execute(struct 
packet_batch_per_flow *batch,
 struct dp_netdev_flow *flow = batch->flow;
 
 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
-batch->tcp_flags, now);
+batch->tcp_flags, now / 1000);
 
 actions = dp_netdev_flow_get_actions(flow);
 
@@ -5111,7 +5111,7 @@ dpif_netdev_xps_revalidate_pmd(const struct 
dp_netdev_pmd_thread *pmd,
 if (!tx->port->dynamic_txqs) {
 continue;
 }
-interval = now - tx->last_used;
+interval = now / 1000 - tx->last_used;
 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
 port = tx->port;
 ovs_mutex_lock(&port->txq_used_mutex);
@@ -5132,6 +5132,8 @@ dpif_netdev_xps_get_tx_qid(const struct 
dp_netdev_pmd_thread *pmd,
 
 if (OVS_UNLIKELY(!now)) {
 now = time_msec();
+} else {
+now /= 1000;
 }
 
 interval = now - tx->last_used;
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 23930f0..1c6ae7c 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -345,9 +345,10 @@
   
 
   
+  type='{"type": "integer",
+ "minInteger": 0, "maxInteger": 100}'>
 
-  Specifies the time in milliseconds that a packet can wait in output
+  Specifies the time in microseconds that a packet can wait in output
   batch for sending i.e. amount of time that packet can spend in an
   intermediate output queue before sending to netdev.
   This option can be used to configure balance between throughput
-- 
2.7.4



[ovs-dev] [PATCH RFC 1/2] timeval: Introduce time_usec().

2017-08-14 Thread Ilya Maximets
This function will provide monotonic time in microseconds.
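
A minimal usage sketch (the work being timed is hypothetical):

    long long start = time_usec();
    process_batch();                    /* Hypothetical work. */
    VLOG_DBG("batch took %lld us", time_usec() - start);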

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/timeval.c | 22 ++
 lib/timeval.h |  2 ++
 2 files changed, 24 insertions(+)

diff --git a/lib/timeval.c b/lib/timeval.c
index dd63f03..be2eddc 100644
--- a/lib/timeval.c
+++ b/lib/timeval.c
@@ -233,6 +233,22 @@ time_wall_msec(void)
 return time_msec__(&wall_clock);
 }
 
+static long long int
+time_usec__(struct clock *c)
+{
+struct timespec ts;
+
+time_timespec__(c, &ts);
+return timespec_to_usec(&ts);
+}
+
+/* Returns a monotonic timer, in microseconds. */
+long long int
+time_usec(void)
+{
+return time_usec__(&monotonic_clock);
+}
+
 /* Configures the program to die with SIGALRM 'secs' seconds from now, if
  * 'secs' is nonzero, or disables the feature if 'secs' is zero. */
 void
@@ -360,6 +376,12 @@ timeval_to_msec(const struct timeval *tv)
 return (long long int) tv->tv_sec * 1000 + tv->tv_usec / 1000;
 }
 
+long long int
+timespec_to_usec(const struct timespec *ts)
+{
+return (long long int) ts->tv_sec * 1000 * 1000 + ts->tv_nsec / 1000;
+}
+
 /* Returns the monotonic time at which the "time" module was initialized, in
  * milliseconds. */
 long long int
diff --git a/lib/timeval.h b/lib/timeval.h
index 7957dad..8e74551 100644
--- a/lib/timeval.h
+++ b/lib/timeval.h
@@ -54,6 +54,7 @@ time_t time_now(void);
 time_t time_wall(void);
 long long int time_msec(void);
 long long int time_wall_msec(void);
+long long int time_usec(void);
 void time_timespec(struct timespec *);
 void time_wall_timespec(struct timespec *);
 void time_alarm(unsigned int secs);
@@ -62,6 +63,7 @@ int time_poll(struct pollfd *, int n_pollfds, HANDLE *handles,
 
 long long int timespec_to_msec(const struct timespec *);
 long long int timeval_to_msec(const struct timeval *);
+long long int timespec_to_usec(const struct timespec *);
 
 struct tm_msec *localtime_msec(long long int now, struct tm_msec *result);
 struct tm_msec *gmtime_msec(long long int now, struct tm_msec *result);
-- 
2.7.4



Re: [ovs-dev] [patch_v2 2/3] dpif-netdev: Refactor some pmd stats.

2017-08-14 Thread Ilya Maximets
> The per packets stats are presently overlapping in that
> miss stats include lost stats; make these stats non-overlapping
> for clarity and make this clear in the dp_stat_type enum.  This
> also eliminates the need to increment two 'miss' stats for a
> single packet.
> 
> The subtable lookup stats is renamed to make it
> clear that it relates to masked lookups.
> 
> The stats that total to the number of packets seen are defined
> in dp_stat_type and an api is created to total the stats in case
> these stats are further divided in the future.
> 
> Signed-off-by: Darrell Ball 
> ---
>  lib/dpif-netdev.c | 58 
> ---
>  1 file changed, 38 insertions(+), 20 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 17e1666..38f5203 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -323,12 +323,21 @@ static struct dp_netdev_port 
> *dp_netdev_lookup_port(const struct dp_netdev *dp,
>  OVS_REQUIRES(dp->port_mutex);
>  
>  enum dp_stat_type {
> -DP_STAT_EXACT_HIT,  /* Packets that had an exact match (emc). */
> -DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
> -DP_STAT_MISS,   /* Packets that did not match. */
> -DP_STAT_LOST,   /* Packets not passed up to the client. */
> -DP_STAT_LOOKUP_HIT, /* Number of subtable lookups for flow table
> -   hits */
> +DP_STAT_EXACT_HIT,  /* Packets that had an exact match (emc). */
> +DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
> +DP_STAT_MISS,   /* Packets that did not match and were passed
> +   up to the client. */

This definition of 'DP_STAT_MISS' looks illogical to me. 'MISS' means
a cache miss (EMC or CLS), but with the new definition the name
becomes meaningless.
Also, there is an informal concept in OVS, 'execute a miss', which
means executing an upcall, and users expect the miss counter to
reflect the number of upcalls executed.

Maybe we can define something like DP_N_CACHE_STAT to split the cache
(EMC and CLS) hit/miss stats from others?
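
One possible shape of that suggestion (a sketch, not a proposed patch):

    enum dp_stat_type {
        DP_STAT_EXACT_HIT,        /* EMC hit. */
        DP_STAT_MASKED_HIT,       /* Flow table (classifier) hit. */
        DP_STAT_MISS,             /* Cache miss, i.e. upcall executed. */
        DP_N_CACHE_STAT,          /* Marker: the stats above are the
                                   * cache (EMC/CLS) hit/miss counters. */
        DP_STAT_LOST = DP_N_CACHE_STAT, /* Not passed up to the client. */
        DP_STAT_MASKED_LOOKUP_HIT,      /* Subtable lookups for hits. */
        DP_N_STATS
    };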

> +DP_STAT_LOST,   /* Packets that did not match and were not
> +   passed up to client. */
> +DP_N_TOT_PKT_STAT,  /* The above statistics account for the total
> +   number of packets seen and should be
> +   non overlapping with each other. */
> +DP_STAT_MASKED_LOOKUP_HIT = DP_N_TOT_PKT_STAT,  /* Number of subtable
> +   lookups for flow table
> +   hits. Each MASKED_HIT
> +   hit will have >= 1
> +   MASKED_LOOKUP_HIT
> +   hit(s). */

Do we need the '_HIT' suffix for DP_STAT_MASKED_LOOKUP_HIT?
It's a kind of strange name, because we're counting lookups, not hits.


>  DP_N_STATS
>  };
>  
> @@ -749,13 +758,22 @@ enum pmd_info_type {
>  PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */
>  };
>  
> +static unsigned long long
> +dp_netdev_calcul_total_packets(unsigned long long *stats)
> +{
> +unsigned long long total_packets = 0;
> +for (uint8_t i = 0; i < DP_N_TOT_PKT_STAT; i++) {
> +total_packets += stats[i];
> +}
> +return total_packets;
> +}
> +
>  static void
>  pmd_info_show_stats(struct ds *reply,
>  struct dp_netdev_pmd_thread *pmd,
>  unsigned long long stats[DP_N_STATS],
>  uint64_t cycles[PMD_N_CYCLES])
>  {
> -unsigned long long total_packets;
>  uint64_t total_cycles = 0;
>  int i;
>  
> @@ -773,10 +791,6 @@ pmd_info_show_stats(struct ds *reply,
>  }
>  }
>  
> -/* Sum of all the matched and not matched packets gives the total.  */
> -total_packets = stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT]
> -+ stats[DP_STAT_MISS];
> -
>  for (i = 0; i < PMD_N_CYCLES; i++) {
>  if (cycles[i] > pmd->cycles_zero[i]) {
> cycles[i] -= pmd->cycles_zero[i];
> @@ -804,7 +818,8 @@ pmd_info_show_stats(struct ds *reply,
>"\tmiss:%llu\n\tlost:%llu\n",
>stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
>stats[DP_STAT_MASKED_HIT] > 0
> -  ? (1.0*stats[DP_STAT_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT]
> +  ? (1.0 * stats[DP_STAT_MASKED_LOOKUP_HIT])
> + / stats[DP_STAT_MASKED_HIT]
>: 0,
>stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
>  
> @@ -820,6 +835,9 @@ pmd_info_show_stats(struct ds *reply,
>cycles[PMD_CYCLES_PROCESSING],
>cycles[PMD_CYCLES_PROCESSING] / 

Re: [ovs-dev] [PATCH RFC v3 4/4] dpif-netdev: Time based output batching.

2017-08-14 Thread Ilya Maximets
On 14.08.2017 16:12, Jan Scheurich wrote:
>>> From earlier in-house trials we know we need to target flush times of 50
>> us or less, so we clearly need better time resolution. Sub-ms timing in PMD
>> should be based on TSC cycles, which are already kept in the pmd struct.
>> Could you provide a corresponding patch for performance testing?
>>
>> I don't think that TSC is suitable in this case. Some reasons:
>>
>> * non-PMD threads are able to float across cpu cores.
>> * Turbo-boost can be enabled or frequency can be adjusted manually after
>> DPDK init.
>> * TSC cycles only calculated if DPDK enabled.
>>
>> TSC is used currently only for not really precise statistics.
>> For the real features we need more accurate time accounting.
>>
>> I believe that CLOCK_MONOTONIC is able to provide at least microsecond
>> granularity on the most of systems. We just need to add one more wrapper
>> function like 'time_usec()' to the lib/timeval.
>>
> 
> We have tested the effect of turbo mode on TSC and there is none. The TSC 
> frequency remains at the nominal clock speed, no matter if the core is 
> clocked down or up. So, I believe for PMD threads (where performance matters) 
> TSC would be an adequate and efficient clock.

It's highly platform dependent, and testing on a few systems doesn't
guarantee anything. On the other hand, POSIX guarantees the monotonic
characteristics of CLOCK_MONOTONIC.

> 
> On PMDs I am a bit concerned about the overhead/latency introduced with the 
> clock_gettime() system call, but I haven't done any measurements to check the 
> actual impact. Have you?

Have you seen my incremental patches?
There is no overhead, because we're just replacing 'time_msec' with 'time_usec'.
No difference except converting timespec to usec instead of msec.
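
If someone wants actual numbers, a self-contained sketch (a standalone
test program, not OVS code) that measures the cost of a CLOCK_MONOTONIC
read:

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        enum { N = 10 * 1000 * 1000 };
        struct timespec ts, start, end;

        clock_gettime(CLOCK_MONOTONIC, &start);
        for (int i = 0; i < N; i++) {
            clock_gettime(CLOCK_MONOTONIC, &ts);  /* Call under test. */
        }
        clock_gettime(CLOCK_MONOTONIC, &end);

        double ns = (end.tv_sec - start.tv_sec) * 1e9
                    + (end.tv_nsec - start.tv_nsec);
        printf("%.1f ns per clock_gettime() call\n", ns / N);
        return 0;
    }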

> 
> If we go for CLOCK_MONOTONIC in microsecond resolution, we should make sure 
> that the clock is read not more than once every iteration (and cache the 
> us value as now in the pmd ctx struct as suggested in your other patch). But 
> then for consistency also the XPS feature should use the PMD time in us 
> resolution.

Again, please, look at my incremental patches.

> 
> For non-PMD thread we could actually skip time-based output batching 
> completely. The packet rates and the frequency of calls to dpif_netdev_run() 
> in the main ovs-vswitchd thread are so low that time-based flushing doesn't 
> seem to make much sense.
> 
> Below you can find an alternative incremental patch on top of your RFC 4/4 
> that uses TSC on PMD. We will be comparing the two alternatives for 
> performance both with non-PMD guests (iperf3) as well as PMD guests (DPDK 
> testpmd).

In your version you need to move all the output_batching related code
under #ifdef DPDK_NETDEV, because it will break userspace networking
if compiled without DPDK and output-max-latency != 0.

> 
> BR, Jan
> 
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 0d78ae4..8285786 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -265,7 +265,7 @@ struct dp_netdev {
>  struct hmap ports;
>  struct seq *port_seq;   /* Incremented whenever a port changes. */
>  
> -/* The time that a packet can wait in output batch for sending. */
> +/* Time in cycles that a packet can wait in output batch for sending. */
>  atomic_uint32_t output_max_latency;
>  
>  /* Meters. */
> @@ -508,7 +508,7 @@ struct tx_port {
>  int qid;
>  long long last_used;
>  struct hmap_node node;
> -long long output_time;
> +long long flush_time;   /* Time in TSC cycles to flush output buffer. */
>  struct dp_packet_batch output_pkts;
>  };
>  
> @@ -622,6 +622,7 @@ struct dpif_netdev {
>  uint64_t last_port_seq;
>  };
>  
> +static inline unsigned long cycles_per_microsecond(void);
>  static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
>struct dp_netdev_port **portp)
>  OVS_REQUIRES(dp->port_mutex);
> @@ -2921,11 +2922,12 @@ dpif_netdev_set_config(struct dpif *dpif, const 
> struct smap *other_config)
>  uint32_t output_max_latency, cur_max_latency;
>  
>  output_max_latency = smap_get_int(other_config, "output-max-latency",
> -  DEFAULT_OUTPUT_MAX_LATENCY);
> +  DEFAULT_OUTPUT_MAX_LATENCY)
> + * cycles_per_microsecond();
> atomic_read_relaxed(&dp->output_max_latency, &cur_max_latency);
>  if (output_max_latency != cur_max_latency) {
> atomic_store_relaxed(&dp->output_max_latency, output_max_latency);
> -VLOG_INFO("Output maximum latency set to %"PRIu32" ms",
> +VLOG_INFO("Output maximum latency set to %"PRIu32" cycles",
>output_max_latency);
>  }
>  
> @@ -3091,6 +3093,16 @@ cycles_counter(void)
>  #endif
>  }
>  
> +static inline unsigned long
> +cycles_per_microsecond(void)
> +{
> +#ifdef DPDK_NETDEV
> +return rte_get_tsc_hz() / 

Re: [ovs-dev] [patch_v4 2/2] dpif-netdev: Refactor some pmd stats.

2017-08-16 Thread Ilya Maximets
> The per packets stats are presently overlapping in that
> miss stats include lost stats; make these stats non-overlapping
> for clarity and make this clear in the dp_stat_type enum.  This
> also eliminates the need to increment two 'miss' stats for a
> single packet.
> 
> The subtable lookup stats is renamed to make it
> clear that it relates to masked lookups.
> The stats that total to the number of packets seen are defined
> in dp_stat_type and an api is created to total the stats in case
> these stats are further divided in the future.
> 
> The pmd stats test is enhanced to include megaflow stats
> counting and checking.
> Also, miss and lost stats are annotated to make it clear
> what they mean.
> 
> Signed-off-by: Darrell Ball 
> ---
>  lib/dpif-netdev.c | 78 
> ++-
>  tests/pmd.at  | 31 +-
>  2 files changed, 74 insertions(+), 35 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 17e1666..dfc6684 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -323,12 +323,19 @@ static struct dp_netdev_port 
> *dp_netdev_lookup_port(const struct dp_netdev *dp,
>  OVS_REQUIRES(dp->port_mutex);
>  
>  enum dp_stat_type {
> -DP_STAT_EXACT_HIT,  /* Packets that had an exact match (emc). */
> -DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
> -DP_STAT_MISS,   /* Packets that did not match. */
> -DP_STAT_LOST,   /* Packets not passed up to the client. */
> -DP_STAT_LOOKUP_HIT, /* Number of subtable lookups for flow table
> -   hits */
> +DP_STAT_EXACT_HIT,  /* Packets that had an exact match (emc). */
> +DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
> +DP_STAT_MISS,   /* Packets that did not match and upcall was
> +   done. */
> +DP_STAT_LOST,   /* Packets that did not match and upcall was
> +   not done. */

Packets not passed up to the client and packets for which the upcall
wasn't executed are different sets. Overlapping, but different.
See below.

> +DP_N_PER_PKT_CNT,   /* The above statistics account for the total
> +   number of packets seen and should not be
> +   overlapping with each other. */
> +DP_STAT_MASKED_LOOKUP = DP_N_PER_PKT_CNT,  /* Number of subtable lookups
> +  for flow table hits. Each
> +  MASKED_HIT hit will have
> +  >= 1 MASKED_LOOKUP(s). */
>  DP_N_STATS
>  };
>  
> @@ -749,13 +756,22 @@ enum pmd_info_type {
>  PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */
>  };
>  
> +static unsigned long long
> +dp_netdev_calcul_total_packets(unsigned long long *stats)
> +{
> +unsigned long long total_packets = 0;
> +for (uint8_t i = 0; i < DP_N_PER_PKT_CNT; i++) {
> +total_packets += stats[i];
> +}
> +return total_packets;
> +}
> +
>  static void
>  pmd_info_show_stats(struct ds *reply,
>  struct dp_netdev_pmd_thread *pmd,
>  unsigned long long stats[DP_N_STATS],
>  uint64_t cycles[PMD_N_CYCLES])
>  {
> -unsigned long long total_packets;
>  uint64_t total_cycles = 0;
>  int i;
>  
> @@ -773,10 +789,6 @@ pmd_info_show_stats(struct ds *reply,
>  }
>  }
>  
> -/* Sum of all the matched and not matched packets gives the total.  */
> -total_packets = stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT]
> -+ stats[DP_STAT_MISS];
> -
>  for (i = 0; i < PMD_N_CYCLES; i++) {
>  if (cycles[i] > pmd->cycles_zero[i]) {
> cycles[i] -= pmd->cycles_zero[i];
> @@ -800,11 +812,12 @@ pmd_info_show_stats(struct ds *reply,
>  
>  ds_put_format(reply,
>"\temc hits:%llu\n\tmegaflow hits:%llu\n"
> -  "\tavg. subtable lookups per hit:%.2f\n"
> -  "\tmiss:%llu\n\tlost:%llu\n",
> +  "\tavg. subtable lookups per megaflow hit:%.2f\n"
> +  "\tmiss(upcall done):%llu\n\tlost(upcall not done):%llu\n",
>stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
>stats[DP_STAT_MASKED_HIT] > 0
> -  ? (1.0*stats[DP_STAT_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT]
> +  ? (1.0 * stats[DP_STAT_MASKED_LOOKUP])
> + / stats[DP_STAT_MASKED_HIT]
>: 0,
>stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
>  
> @@ -820,6 +833,9 @@ pmd_info_show_stats(struct ds *reply,
>cycles[PMD_CYCLES_PROCESSING],
>cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 
> 100);
>  
> +/* Sum of all the matched and not matched 

Re: [ovs-dev] [PATCH] odp-execute: Reuse rss hash in OVS_ACTION_ATTR_HASH.

2017-07-13 Thread Ilya Maximets
On 12.07.2017 10:15, Andy Zhou wrote:
> On Tue, Jul 11, 2017 at 7:30 AM, Ilya Maximets <i.maxim...@samsung.com> wrote:
>> If RSS hash exists in a packet it can be reused instead of
>> 5 tuple hash re-calculation in OVS_ACTION_ATTR_HASH. This
>> leads to increasing the performance of sending packets to
>> the OVS bonding in userspace datapath up to 10-15%.
>>
>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
>> ---
>>  lib/odp-execute.c | 13 +++--
>>  1 file changed, 11 insertions(+), 2 deletions(-)
>>
>> diff --git a/lib/odp-execute.c b/lib/odp-execute.c
>> index d656334..471a364 100644
>> --- a/lib/odp-execute.c
>> +++ b/lib/odp-execute.c
>> @@ -646,8 +646,17 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
>> *batch, bool steal,
>>  uint32_t hash;
>>
>>  DP_PACKET_BATCH_FOR_EACH (packet, batch) {
>> -flow_extract(packet, &flow);
>> -hash = flow_hash_5tuple(&flow, hash_act->hash_basis);
>> +/* RSS hash can be used here instead of 5tuple for
>> + * performance reasons. */
>> +if (dp_packet_rss_valid(packet)) {
>> +hash = dp_packet_get_rss_hash(packet);
> 
>> +if (hash_act->hash_basis) {
>> +hash = hash_finish(hash, hash_act->hash_basis);
>> +}
> 
> This is not a full review. I have some comments on the 3 lines of code above.
> 
> Would it make more sense to always include 'hash_basis'?  Also it
> seems hash_int() would be more appropriate than hash_finish().
> 
> I suppose the performance gain may not be as significant. On the other
> hand, I am not sure if we should count on the performance
> gain by assuming hash_basis is always zero.

I performed a few tests with the hash basis set and unset and found
no significant performance difference. Also, I checked your
suggestion with the following incremental applied:
-
-if (hash_act->hash_basis) {
-hash = hash_finish(hash, hash_act->hash_basis);
-}
+hash = hash_int(hash, hash_act->hash_basis);
-

This also doesn't affect performance significantly.

I agree that code without assumptions on the hash_basis value looks
more correct, so I will send v2 with that change. Also, I fixed one
unit test that depends on the dp_hash value; I'll include this fix in
v2 too.

> 
>> +} else {
>> +flow_extract(packet, );
>> +hash = flow_hash_5tuple(, 
>> hash_act->hash_basis);
>> +}
>>  packet->md.dp_hash = hash;
>>  }
>>  } else {
>> --
>> 2.7.4
>>
> 
> 
> 


[ovs-dev] [PATCH v2] odp-execute: Reuse rss hash in OVS_ACTION_ATTR_HASH.

2017-07-13 Thread Ilya Maximets
If an RSS hash exists in a packet, it can be reused instead of
recalculating the 5-tuple hash in OVS_ACTION_ATTR_HASH. This
increases the performance of sending packets to OVS bonding in
the userspace datapath by up to 10-15%.

Additionally, the unit test 'select group with dp_hash selection
method' is fixed to not depend on the dp_hash value.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---

Version 2:
* Removed assumption on hash_basis value.
* hash_finish replaced with hash_int as more appropriate.
* Fixed 'select group with dp_hash selection method' UT.

 lib/odp-execute.c | 11 +--
 tests/ofproto-dpif.at |  4 ++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index d656334..03120bf 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -646,8 +646,15 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 uint32_t hash;
 
 DP_PACKET_BATCH_FOR_EACH (packet, batch) {
-flow_extract(packet, &flow);
-hash = flow_hash_5tuple(&flow, hash_act->hash_basis);
+/* RSS hash can be used here instead of 5tuple for
+ * performance reasons. */
+if (dp_packet_rss_valid(packet)) {
+hash = dp_packet_get_rss_hash(packet);
+hash = hash_int(hash, hash_act->hash_basis);
+} else {
+flow_extract(packet, &flow);
+hash = flow_hash_5tuple(&flow, hash_act->hash_basis);
+}
 packet->md.dp_hash = hash;
 }
 } else {
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 8373f90..83c72cf 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -491,10 +491,10 @@ for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
 AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
 done
 
-AT_CHECK([ovs-appctl dpctl/dump-flows | sed 
's/dp_hash(.*\/0x1)/dp_hash(0x\/0x1)/' | strip_ufid | strip_used | sort], 
[0], [dnl
+AT_CHECK([ovs-appctl dpctl/dump-flows | sed 
's/dp_hash(.*\/0x1)/dp_hash(0x\/0x1)/' | sed 's/\(actions:1\)[[01]]/\1X/' | 
strip_ufid | strip_used | sort], [0], [dnl
 flow-dump from non-dpdk interfaces:
 
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no),
 packets:15, bytes:630, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2)
-recirc_id(0x2),dp_hash(0x/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:15, bytes:630, used:0.0s, actions:11
+recirc_id(0x2),dp_hash(0x/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:15, bytes:630, used:0.0s, actions:1X
 ])
 
 OVS_VSWITCHD_STOP
-- 
2.7.4



[ovs-dev] [PATCH] dp-packet: Remove misleading comment for refill init function.

2017-07-14 Thread Ilya Maximets
Function 'dp_packet_batch_refill_init' doesn't return anything. It
looks like this comment came from one of the intermediate versions
of the API enhancement patch. Additionally, the comment style is
changed to be consistent with other comments in the same file.

CC: Andy Zhou <az...@ovn.org>
Fixes: 72c84bc2db23 ("dp-packet: Enhance packet batch APIs.")
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dp-packet.h | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 38282bd..c5fe32e 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -678,11 +678,8 @@ dp_packet_batch_size(const struct dp_packet_batch *batch)
 return batch->count;
 }
 
-/*
- * Clear 'batch' for refill. Use dp_packet_batch_refill() to add
- * packets back into the 'batch'.
- *
- * Return the original size of the 'batch'.  */
+/* Clear 'batch' for refill. Use dp_packet_batch_refill() to add
+ * packets back into the 'batch'. */
 static inline void
 dp_packet_batch_refill_init(struct dp_packet_batch *batch)
 {
-- 
2.7.4



Re: [ovs-dev] [patch_v1 2/2] System Tests: Improve reliability of an icmp test.

2017-07-17 Thread Ilya Maximets
Not a full review. And I most likely will not review the logic.
Just one comment inline.

Best regards, Ilya Maximets.

> One SNAT test is based on a single ping being successful;
> to make the result more predictable, static arp binding is now used.
> Occasionally, tracing shows the reply side stack does not respond,
> but this is much less common with this change.
> I considered changing the test design itself, but I thought that
> would not be testing the same situation.
> 
> Signed-off-by: Darrell Ball 
> ---
>  tests/system-traffic.at | 35 +--
>  1 file changed, 21 insertions(+), 14 deletions(-)
> 
> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
> index 1ebf928..bc126bc 100644
> --- a/tests/system-traffic.at
> +++ b/tests/system-traffic.at
> @@ -2693,8 +2693,28 @@ OVS_TRAFFIC_VSWITCHD_START()
>  ADD_NAMESPACES(at_ns0, at_ns1)
>  
>  ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
> -NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88])
> +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns0], [arp -s 10.1.1.2 e6:66:c1:22:22:22])
> +
>  ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
> +NS_CHECK_EXEC([at_ns1], [ip link set dev p1 address e6:66:c1:22:22:22])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.1 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.240 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.241 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.242 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.243 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.244 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.245 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.246 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.247 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.248 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.249 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.250 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.251 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.252 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.253 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.254 e6:66:c1:11:11:11])
> +NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.255 e6:66:c1:11:11:11])

How about replacing the above lines with a loop:

for i in 1 `seq 240 255`; do
NS_CHECK_EXEC([at_ns1], [arp -s 10.1.1.$i e6:66:c1:11:11:11])
done

?

>  
>  dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from 
> ns1->ns0.
>  AT_DATA([flows.txt], [dnl
> @@ -2707,20 +2727,7 @@ dnl
>  
> in_port=2,ct_state=+trk,ct_zone=1,ip,action=ct(table=1,commit,zone=1,exec(set_field:1->ct_mark)),1
>  table=1,in_port=2,ct_mark=1,ct_state=+rpl,ct_zone=1,ip,action=1
>  dnl
> -dnl ARP
> -priority=100 arp arp_op=1 
> action=move:OXM_OF_ARP_TPA[[]]->NXM_NX_REG2[[]],resubmit(,8),goto_table:10
> -priority=10 arp action=normal
>  priority=0,action=drop
> -dnl
> -dnl MAC resolution table for IP in reg2, stores mac in OXM_OF_PKT_REG0
> -table=8,reg2=0x0a0101f0/0xfff0,action=load:0x8088->OXM_OF_PKT_REG0[[]]
> -table=8,priority=0,action=load:0->OXM_OF_PKT_REG0[[]]
> -dnl ARP responder mac filled in at OXM_OF_PKT_REG0, or 0 for normal action.
> -dnl TPA IP in reg2.
> -dnl Swaps the fields of the ARP message to turn a query to a response.
> -table=10 priority=100 arp xreg0=0 action=normal
> -table=10 
> priority=10,arp,arp_op=1,action=load:2->OXM_OF_ARP_OP[[]],move:OXM_OF_ARP_SHA[[]]->OXM_OF_ARP_THA[[]],move:OXM_OF_PKT_REG0[[0..47]]->OXM_OF_ARP_SHA[[]],move:OXM_OF_ARP_SPA[[]]->OXM_OF_ARP_TPA[[]],move:NXM_NX_REG2[[]]->OXM_OF_ARP_SPA[[]],move:NXM_OF_ETH_SRC[[]]->NXM_OF_ETH_DST[[]],move:OXM_OF_PKT_REG0[[0..47]]->NXM_OF_ETH_SRC[[]],move:NXM_OF_IN_PORT[[]]->NXM_NX_REG3[[0..15]],load:0->NXM_OF_IN_PORT[[]],output:NXM_NX_REG3[[0..15]]
> -table=10 priority=0 action=drop
>  ])
>  
>  AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
> -- 
> 1.9.1


[ovs-dev] [PATCH] odp-execute: Reuse rss hash in OVS_ACTION_ATTR_HASH.

2017-07-11 Thread Ilya Maximets
If an RSS hash exists in a packet, it can be reused instead of
recalculating the 5-tuple hash in OVS_ACTION_ATTR_HASH. This
increases the performance of sending packets to OVS bonding in
the userspace datapath by up to 10-15%.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/odp-execute.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index d656334..471a364 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -646,8 +646,17 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 uint32_t hash;
 
 DP_PACKET_BATCH_FOR_EACH (packet, batch) {
-flow_extract(packet, &flow);
-hash = flow_hash_5tuple(&flow, hash_act->hash_basis);
+/* RSS hash can be used here instead of 5tuple for
+ * performance reasons. */
+if (dp_packet_rss_valid(packet)) {
+hash = dp_packet_get_rss_hash(packet);
+if (hash_act->hash_basis) {
+hash = hash_finish(hash, hash_act->hash_basis);
+}
+} else {
+flow_extract(packet, &flow);
+hash = flow_hash_5tuple(&flow, hash_act->hash_basis);
+}
 packet->md.dp_hash = hash;
 }
 } else {
-- 
2.7.4



Re: [ovs-dev] [PATCH v3 0/2] conntrack : Add support for rx chceksum offload.

2017-07-10 Thread Ilya Maximets
> ‘chceksum’ is misspelled
> 
> Since these patches really only affect ‘dpdk’, the module name ‘dpdk’ may 
> more accurately
> reflect the real effect of these patches.

Please don't do that. Only patches that change lib/dpdk.{c,h} should
have the 'dpdk' prefix in the subject line. All other patches should
have the proper module name according to the code they're changing.

I have wanted to raise this issue many times, so maybe it's time.
There are many places where changes were made to improve the
DPDK-enabled datapath, but most of the changes are generic and don't
contain much DPDK-related code. Such patches don't need to have 'dpdk'
as a prefix. This only makes a mess of the git history, and you can
never say for sure which module was changed in a particular patch by
looking only at its subject.

IMHO, patches should have prefixes according to the modules they're
changing, as described in the contribution guide. Generic changes
should be reviewed not only by people interested in DPDK. Adding such
misleading prefixes forces others to miss possibly important generic
changes.

On the other side, many people add the 'dpdk' prefix to patches
targeted at 'netdev-dpdk', which is not right either. All patches
should have the right prefix according to the module they are trying
to change. That is my point of view.


In this particular case the patches actually add generic functionality
which can be used even without DPDK, for example if we implement
checksum offloading for netdev-linux (not so hard). DPDK is already
mentioned in the commit message as the target, and there is no need
for misleading prefixes.

Best regards, Ilya Maximets.


Re: [ovs-dev] [PATCH v9] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-07-10 Thread Ilya Maximets
On 08.07.2017 22:09, Stokes, Ian wrote:
>> Previously if there is no available (non-isolated) pmd on the numa node
>> for a port then the port is not polled at all. This can result in a non-
>> operational system until such time as nics are physically repositioned. It
>> is preferable to operate with a pmd on the 'wrong' numa node albeit with
>> lower performance. Local pmds are still chosen when available.
>>
>> Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
>> Co-authored-by: Ilya Maximets <i.maxim...@samsung.com>
>> ---
>> v9: v8 missed some comments on v7
>> v8: Some coding style issues; doc tweak
>> v7: Incorporate review comments on docs and implementation
>> v6: Change 'port' to 'queue' in a warning msg
>> v5: Fix warning msg; Update same in docs
>> v4: Fix a checkpatch error
>> v3: Fix warning messages not appearing when using multiqueue
>> v2: Add details of warning messages into docs
>>
>>  Documentation/intro/install/dpdk.rst | 21 +++---
>>  lib/dpif-netdev.c| 41
>> +---
>>  2 files changed, 56 insertions(+), 6 deletions(-)
>>
>> diff --git a/Documentation/intro/install/dpdk.rst
>> b/Documentation/intro/install/dpdk.rst
>> index e83f852..89775d6 100644
>> --- a/Documentation/intro/install/dpdk.rst
>> +++ b/Documentation/intro/install/dpdk.rst
>> @@ -449,7 +449,7 @@ affinitized accordingly.
>>
>>A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces
>>assigned to it. A pmd thread shall poll the ports for incoming packets,
>> -  switch the packets and send to tx port.  pmd thread is CPU bound, and
>> needs
>> +  switch the packets and send to tx port.  A pmd thread is CPU bound,
>> + and needs
>>to be affinitized to isolated cores for optimum performance.
>>
>>By setting a bit in the mask, a pmd thread is created and pinned to the
>> @@ -458,8 +458,23 @@ affinitized accordingly.
>>$ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4
>>
>>.. note::
>> -pmd thread on a NUMA node is only created if there is at least one
>> DPDK
>> -interface from that NUMA node added to OVS.
>> +A pmd thread on a NUMA node is only created if there is at least one
>> DPDK
>> +interface from that NUMA node added to OVS.  A pmd thread is created
>> by
>> +default on a core of a NUMA node or when a specified pmd-cpu-mask has
>> +indicated so.  Even though a PMD thread may exist, the thread only
>> starts
>> +consuming CPU cycles if there is least one receive queue assigned to
>> +the pmd.
>> +
>> +  .. note::
>> +On NUMA systems PCI devices are also local to a NUMA node.  Unbound
>> rx
>> +queues for a PCI device will assigned to a pmd on it's local NUMA
> 
> Minor point but should read 'will be assigned'
>> node if a
>> +non-isolated PMD exists on that NUMA node.  If not, the queue will be
>> +assigned to a non-isolated pmd on a remote NUMA node.  This will
>> result in
>> +reduced maximum throughput on that device and possibly on other
>> devices
>> +assigned to that pmd thread. In the case such, a queue assignment is
>> made a
>> +warning message will be logged: "There's no available (non-isolated)
>> pmd
> 
> Above should read 'In the case where such a queue assignment is made, a 
> warning message will be logged'
>> +thread on numa node N. Queue Q on port P will be assigned to the pmd
>> on
>> +core C (numa node N'). Expect reduced performance."
>>
>>  - QEMU vCPU thread Affinity
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4e29085..7557f32
>> 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int
>> numa_id)
>>  return NULL;
>>  }
>>
>> +/* Returns next NUMA from rr list in round-robin fashion. Returns the
>> +first
>> + * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is
>> +empty. */ static struct rr_numa * rr_numa_list_next(struct rr_numa_list
>> +*rr, const struct rr_numa *numa) {
> 
> The comment above can be tidied up a little to better clarify the behavior of 
> this function.
> I ended up reading the comments for hmap_next() and hmap_first() before it 
> made sense, and even then it's a bit ambiguous, it ends up being the code 
> that expla

Re: [ovs-dev] [PATCH v9] dpif-netdev: Assign ports to pmds on non-local numa node.

2017-07-10 Thread Ilya Maximets
On 10.07.2017 13:42, O Mahony, Billy wrote:
> 
> 
>> -Original Message-
>> From: Stokes, Ian
>> Sent: Monday, July 10, 2017 10:41 AM
>> To: Ilya Maximets <i.maxim...@samsung.com>; O Mahony, Billy
>> <billy.o.mah...@intel.com>; d...@openvswitch.org
>> Cc: db...@vmare.com
>> Subject: RE: [ovs-dev] [PATCH v9] dpif-netdev: Assign ports to pmds on non-
>> local numa node.
>>
>>> On 08.07.2017 22:09, Stokes, Ian wrote:
>>>>> Previously if there is no available (non-isolated) pmd on the numa
>>>>> node for a port then the port is not polled at all. This can result
>>>>> in a non- operational system until such time as nics are physically
>>>>> repositioned. It is preferable to operate with a pmd on the 'wrong'
>>>>> numa node albeit with lower performance. Local pmds are still
>>>>> chosen
>>> when available.
>>>>>
>>>>> Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
>>>>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
>>>>> Co-authored-by: Ilya Maximets <i.maxim...@samsung.com>
>>>>> ---
>>>>> v9: v8 missed some comments on v7
>>>>> v8: Some coding style issues; doc tweak
>>>>> v7: Incorporate review comments on docs and implementation
>>>>> v6: Change 'port' to 'queue' in a warning msg
>>>>> v5: Fix warning msg; Update same in docs
>>>>> v4: Fix a checkpatch error
>>>>> v3: Fix warning messages not appearing when using multiqueue
>>>>> v2: Add details of warning messages into docs
>>>>>
>>>>>  Documentation/intro/install/dpdk.rst | 21 +++---
>>>>>  lib/dpif-netdev.c| 41
>>>>> +---
>>>>>  2 files changed, 56 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/Documentation/intro/install/dpdk.rst
>>>>> b/Documentation/intro/install/dpdk.rst
>>>>> index e83f852..89775d6 100644
>>>>> --- a/Documentation/intro/install/dpdk.rst
>>>>> +++ b/Documentation/intro/install/dpdk.rst
>>>>> @@ -449,7 +449,7 @@ affinitized accordingly.
>>>>>
>>>>>A poll mode driver (pmd) thread handles the I/O of all DPDK
>>> interfaces
>>>>>assigned to it. A pmd thread shall poll the ports for incoming
>>>>> packets,
>>>>> -  switch the packets and send to tx port.  pmd thread is CPU
>>>>> bound, and needs
>>>>> +  switch the packets and send to tx port.  A pmd thread is CPU
>>>>> + bound, and needs
>>>>>to be affinitized to isolated cores for optimum performance.
>>>>>
>>>>>By setting a bit in the mask, a pmd thread is created and pinned
>>>>> to the @@ -458,8 +458,23 @@ affinitized accordingly.
>>>>>$ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4
>>>>>
>>>>>.. note::
>>>>> -pmd thread on a NUMA node is only created if there is at least one
>>>>> DPDK
>>>>> -interface from that NUMA node added to OVS.
>>>>> +A pmd thread on a NUMA node is only created if there is at
>>>>> + least one
>>>>> DPDK
>>>>> +interface from that NUMA node added to OVS.  A pmd thread is
>>>>> + created
>>>>> by
>>>>> +default on a core of a NUMA node or when a specified
>>>>> + pmd-cpu-mask
>>> has
>>>>> +indicated so.  Even though a PMD thread may exist, the thread
>>>>> + only
>>>>> starts
>>>>> +consuming CPU cycles if there is least one receive queue
>>>>> + assigned
>>> to
>>>>> +the pmd.
>>>>> +
>>>>> +  .. note::
>>>>> +On NUMA systems PCI devices are also local to a NUMA node.
>>>>> + Unbound
>>>>> rx
>>>>> +queues for a PCI device will assigned to a pmd on it's local
>>>>> + NUMA
>>>>
>>>> Minor point but should read 'will be assigned'
> 
> [[BO'M]] 
> +1
> 
>>>>> node if a
>>>>> +non-isolated PMD exists on that NUMA node.  If not, the queue
>>>>> + will
>>> be
>>>>> +assigned to a non-isolated pmd

Re: [ovs-dev] [PATCH v2 2/3] dpif-netdev: Avoid port's reconfiguration on pmd-cpu-mask changes.

2017-07-11 Thread Ilya Maximets
On 11.07.2017 05:10, Darrell Ball wrote:
> 
> 
> On 7/10/17, 12:41 AM, "Ilya Maximets" <i.maxim...@samsung.com> wrote:
> 
> On 07.07.2017 21:09, Darrell Ball wrote:
> > 
>     > 
> > On 7/6/17, 11:11 PM, "Ilya Maximets" <i.maxim...@samsung.com> wrote:
> > 
> > On 07.07.2017 08:08, Darrell Ball wrote:
> > > 
> > > 
> > > On 5/30/17, 7:12 AM, "Ilya Maximets" <i.maxim...@samsung.com> 
> wrote:
> > > 
> > > Reconfiguration of HW NICs may lead to packet drops.
> > > In current model all physical ports will be reconfigured each
> > > time number of PMD threads changed. Since we not stopping
> > > threads on pmd-cpu-mask changes, this patch will help to 
> further
> > > decrease port's downtime by setting the maximum possible 
> number
> > > of wanted tx queues to avoid unnecessary reconfigurations.
> > > 
> > > Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
> > > ---
> > >  lib/dpif-netdev.c | 26 +-
> > >  1 file changed, 21 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > > index 596d133..79db770 100644
> > > --- a/lib/dpif-netdev.c
> > > +++ b/lib/dpif-netdev.c
> > > @@ -3453,7 +3453,7 @@ reconfigure_datapath(struct dp_netdev 
> *dp)
> > >  {
> > >  struct dp_netdev_pmd_thread *pmd;
> > >  struct dp_netdev_port *port;
> > > -int wanted_txqs;
> > > +int needed_txqs, wanted_txqs;
> > >  
> > >  dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
> > >  
> > > @@ -3461,7 +3461,15 @@ reconfigure_datapath(struct dp_netdev 
> *dp)
> > >   * on the system and the user configuration. */
> > >  reconfigure_pmd_threads(dp);
> > >  
> > > -wanted_txqs = cmap_count(&dp->poll_threads);
> > > +/* We need 1 Tx queue for each thread to avoid locking, 
> but we will try
> > > + * to allocate the maximum possible value to minimize 
> the number of port
> > > + * reconfigurations. */
> > > +needed_txqs = cmap_count(&dp->poll_threads);
> > > +/* (n_cores + 1) is the maximum that we might need to 
> have.
> > > + * Additional queue is for non-PMD threads. */
> > > +wanted_txqs = ovs_numa_get_n_cores();
> > > +ovs_assert(wanted_txqs != OVS_CORE_UNSPEC);
> > > +wanted_txqs++;
> > > 
> > > I don’t think PMD mask changes are common, so this patch is 
> trying to optimize a 
> > > rare disruptive event that can/will be scheduled by the 
> administrator.
> > > 
> > > Based on the actual number of queues supported and the number of 
> cores present,
> > > this optimization may or may not work. It is unpredictable 
> whether there will be benefit
> > > in a particular case from the user POV.
> > > If I were the administrator, I would try to error on the 
> conservative side anyways if I could
> > > not predict the result.
> > > 
> > > Did I miss something ?
> > 
> > In NFV environment if you want to add one more VM to your hosts you 
> will have to
> > choose between:
> > 
> > * not creating a new PMD thread
> > -> performance degradation of networking for other 
> working VMs
> > 
> > 
> > * adding of the new PMD thread
> -> disruption of the networking for the whole host for 
> the time
> >of OVS reconfiguration.
> > 
> > This patch removes the cost of the second option.
> > 
> > In general, adding a PMD thread per VM may not always (or even usually) 
> make sense.
> 
> Not per VM. Let's assume that all existing threads are already overloaded.
> 
> That would be less c

Re: [ovs-dev] [PATCH v3 0/3] Incremental addition/deletion of PMD threads.

2017-07-11 Thread Ilya Maximets
Patches 1 and 3 are also reviewed by Darrell now.
So, maybe we can apply them before we finish our holy war about patch #2?

We can easily apply #3 without #2 because they have nothing in common.

Best regards, Ilya Maximets.

On 05.07.2017 17:30, Ilya Maximets wrote:
> Hi all.
> IMHO, these patches were reviewed and tested enough to be merged.
> Does anybody have objections?
> Ben, Darrell, what do you think?
> 
> Best regards, Ilya Maximets.
> 
> On 15.06.2017 14:36, Ilya Maximets wrote:
>> Version 3:
>>  * Added comment about 'static_txq_id's adjustment.
>>  * Added additional parentheses around 'dynamic_txqs'
>>comparison operand because of warning from GCC 6.3.1.
>>
>> Version 2:
>>  * Dropped patch [1/4] as already applied to master.
>>  * Fixed checkpatch warning in patch [2/4].
>>  * 'reconfigure_pmd_threads' modified for better maintainability
>>as suggested by Daniele Di Proietto in patch [2/4]:
>>1. delete old pmd threads
>>2. reconfigure if 'static_tx_qid's adjustment needed
>>3. add new pmd threads
>>  * Fixed locking of HW tx queues in case of big amount of
>>available cores in patch [3/4].
>>  * Dropped RFC tag for patch [4/4].
>>
>> Ilya Maximets (3):
>>   dpif-netdev: Incremental addition/deletion of PMD threads.
>>   dpif-netdev: Avoid port's reconfiguration on pmd-cpu-mask changes.
>>   dpif-netdev: Don't uninit emc on reload.
>>
>>  lib/dpif-netdev.c | 176 
>> --
>>  tests/pmd.at  |   2 +-
>>  2 files changed, 131 insertions(+), 47 deletions(-)



[ovs-dev] [PATCH 0/4] Checkpatch enhancements.

2017-07-14 Thread Ilya Maximets
Ilya Maximets (4):
  checkpatch: Don't allow Gerrit Change-Ids.
  checkpatch: Print results while checking HEAD and stdin.
  checkpatch: Allow checking more than one file.
  checkpatch: Print commit hashes and names.

 utilities/checkpatch.py | 64 +
 1 file changed, 49 insertions(+), 15 deletions(-)

-- 
2.7.4



[ovs-dev] [PATCH 1/4] checkpatch: Don't allow Gerrit Change-Ids.

2017-07-14 Thread Ilya Maximets
Local Gerrit Change-Ids are not welcome in the common repository.
Inspired by checkpatch.pl from the Linux kernel.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 utilities/checkpatch.py | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py
index 65d188d..fe266ac 100755
--- a/utilities/checkpatch.py
+++ b/utilities/checkpatch.py
@@ -320,6 +320,8 @@ def ovs_checkpatch_parse(text, filename):
   re.I | re.M | re.S)
 is_co_author = re.compile(r'(\s*(Co-authored-by: )(.*))$',
   re.I | re.M | re.S)
+is_gerrit_change_id = re.compile(r'(\s*(change-id: )(.*))$',
+ re.I | re.M | re.S)
 
 for line in text.split('\n'):
 if current_file != previous_file:
@@ -357,6 +359,10 @@ def ovs_checkpatch_parse(text, filename):
 elif is_co_author.match(line):
 m = is_co_author.match(line)
 co_authors.append(m.group(3))
+elif is_gerrit_change_id.match(line):
+print_error(
+"Remove Gerrit Change-Id's before submitting upstream.")
+print("%d: %s\n" % (lineno, line))
 elif parse == 2:
 newfile = hunks.match(line)
 if newfile:
-- 
2.7.4



[ovs-dev] [PATCH 3/4] checkpatch: Allow checking more than one file.

2017-07-14 Thread Ilya Maximets
Currently, to check more than one patch or file, it's required
to invoke the script for each file separately.
Fix that by iterating over all the passed filenames.

Note: If the '-f' option is passed, all files are treated as source files.
  Without '-f', all files are treated as patch files.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 utilities/checkpatch.py | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py
index 4a92890..7ccec51 100755
--- a/utilities/checkpatch.py
+++ b/utilities/checkpatch.py
@@ -408,7 +408,7 @@ def usage():
 Open vSwitch checkpatch.py
 Checks a patch for trivial mistakes.
 usage:
-%s [options] [PATCH | -f SOURCE | -1 | -2 | ...]
+%s [options] [PATCH1 [PATCH2 ...] | -f SOURCE1 [SOURCE2 ...] | -1 | -2 | ...]
 
 Input options:
 -f|--check-fileArguments are source files, not patches.
@@ -513,13 +513,18 @@ if __name__ == '__main__':
 status = -1
 sys.exit(status)
 
-try:
-filename = args[0]
-except:
+if not args:
 if sys.stdin.isatty():
 usage()
 sys.exit(-1)
 result = ovs_checkpatch_parse(sys.stdin.read(), '-')
 ovs_checkpatch_print_result(result)
 sys.exit(result)
-sys.exit(ovs_checkpatch_file(filename))
+
+status = 0
+for filename in args:
+print('== Checking "%s" ==' % filename)
+result = ovs_checkpatch_file(filename)
+if result:
+status = -1
+sys.exit(status)
-- 
2.7.4



[ovs-dev] [PATCH 2/4] checkpatch: Print results while checking HEAD and stdin.

2017-07-14 Thread Ilya Maximets
Currently, the result status is printed only for patch files.
It would be nice to have results for the other checking types.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 utilities/checkpatch.py | 33 +
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py
index fe266ac..4a92890 100755
--- a/utilities/checkpatch.py
+++ b/utilities/checkpatch.py
@@ -63,6 +63,13 @@ def print_warning(message):
 __warnings = __warnings + 1
 
 
+def reset_counters():
+global __errors, __warnings
+
+__errors = 0
+__warnings = 0
+
+
 # These are keywords whose names are normally followed by a space and
 # something in parentheses (usually an expression) then a left curly brace.
 #
@@ -323,6 +330,8 @@ def ovs_checkpatch_parse(text, filename):
 is_gerrit_change_id = re.compile(r'(\s*(change-id: )(.*))$',
  re.I | re.M | re.S)
 
+reset_counters()
+
 for line in text.split('\n'):
 if current_file != previous_file:
 previous_file = current_file
@@ -414,8 +423,16 @@ Check options:
   % sys.argv[0])
 
 
+def ovs_checkpatch_print_result(result):
+global __warnings, __errors, total_line
+if result < 0:
+print("Lines checked: %d, Warnings: %d, Errors: %d\n" %
+  (total_line, __warnings, __errors))
+else:
+print("Lines checked: %d, no obvious problems found\n" % (total_line))
+
+
 def ovs_checkpatch_file(filename):
-global __warnings, __errors, checking_file, total_line
 try:
 mail = email.message_from_file(open(filename, 'r'))
 except:
@@ -426,11 +443,7 @@ def ovs_checkpatch_file(filename):
 if part.get_content_maintype() == 'multipart':
 continue
 result = ovs_checkpatch_parse(part.get_payload(decode=False), filename)
-if result < 0:
-print("Lines checked: %d, Warnings: %d, Errors: %d" %
-  (total_line, __warnings, __errors))
-else:
-print("Lines checked: %d, no obvious problems found" % (total_line))
+ovs_checkpatch_print_result(result)
 return result
 
 
@@ -494,7 +507,9 @@ if __name__ == '__main__':
 f.close()
 
 print('== Checking %s ==' % revision)
-if ovs_checkpatch_parse(patch, revision):
+result = ovs_checkpatch_parse(patch, revision)
+ovs_checkpatch_print_result(result)
+if result:
 status = -1
 sys.exit(status)
 
@@ -504,5 +519,7 @@ if __name__ == '__main__':
 if sys.stdin.isatty():
 usage()
 sys.exit(-1)
-sys.exit(ovs_checkpatch_parse(sys.stdin.read(), '-'))
+result = ovs_checkpatch_parse(sys.stdin.read(), '-')
+ovs_checkpatch_print_result(result)
+sys.exit(result)
 sys.exit(ovs_checkpatch_file(filename))
-- 
2.7.4



[ovs-dev] [PATCH 4/4] checkpatch: Print commit hashes and names.

2017-07-14 Thread Ilya Maximets
It's better to see real commits instead of 'HEAD~n'.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 utilities/checkpatch.py | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py
index 7ccec51..f61b8b7 100755
--- a/utilities/checkpatch.py
+++ b/utilities/checkpatch.py
@@ -500,13 +500,19 @@ if __name__ == '__main__':
 
 if n_patches:
 status = 0
+
+git_log = 'git log --no-color --no-merges --pretty=format:"%H %s" '
+f = os.popen(git_log + '-%d' % n_patches, 'r')
+commits = f.read().split("\n")
+f.close()
+
 for i in reversed(range(0, n_patches)):
-revision = 'HEAD~%d' % i
+revision, name = commits[i].split(" ", 1)
 f = os.popen('git format-patch -1 --stdout %s' % revision, 'r')
 patch = f.read()
 f.close()
 
-print('== Checking %s ==' % revision)
+print('== Checking %s ("%s") ==' % (revision[0:12], name))
 result = ovs_checkpatch_parse(patch, revision)
 ovs_checkpatch_print_result(result)
 if result:
-- 
2.7.4



Re: [ovs-dev] [PATCH v2] odp-execute: Reuse rss hash in OVS_ACTION_ATTR_HASH.

2017-07-18 Thread Ilya Maximets
On 18.07.2017 06:48, Darrell Ball wrote:
> 
> 
> On 7/13/17, 8:07 AM, "ovs-dev-boun...@openvswitch.org on behalf of Ilya 
> Maximets" <ovs-dev-boun...@openvswitch.org on behalf of 
> i.maxim...@samsung.com> wrote:
> 
> If an RSS hash exists in a packet, it can be reused instead of
> recalculating the 5-tuple hash in OVS_ACTION_ATTR_HASH. This
> increases the performance of sending packets to an OVS bond in
> the userspace datapath by up to 10-15%.
> 
> Additionally fixed unit test 'select group with dp_hash
> selection method' to not depend on dp_hash value.
> 
> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
> ---
> 
> Version 2:
>   * Removed assumption on hash_basis value.
>   * hash_finish replaced with hash_int as more appropriate.
>   * Fixed 'select group with dp_hash selection method' UT.
> 
>  lib/odp-execute.c | 11 +--
>  tests/ofproto-dpif.at |  4 ++--
>  2 files changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/lib/odp-execute.c b/lib/odp-execute.c
> index d656334..03120bf 100644
> --- a/lib/odp-execute.c
> +++ b/lib/odp-execute.c
> @@ -646,8 +646,15 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
> *batch, bool steal,
>  uint32_t hash;
>  
>  DP_PACKET_BATCH_FOR_EACH (packet, batch) {
> -flow_extract(packet, &flow);
> -hash = flow_hash_5tuple(&flow, hash_act->hash_basis);
> +/* RSS hash can be used here instead of 5tuple for
> + * performance reasons. */
> +if (dp_packet_rss_valid(packet)) {
> +hash = dp_packet_get_rss_hash(packet);
> +hash = hash_int(hash, hash_act->hash_basis);
> +} else {
> +flow_extract(packet, &flow);
> +hash = flow_hash_5tuple(&flow, hash_act->hash_basis);
> +}
> 
> Presently, OVS does not have configurable hashing fields for bonds, although 
> this seems to be asked for.
> Also, OVS does not have symmetrical hashing for bonding, as exists for the 
> multipath action.
> If/when these features are added, taking the RSS of various input interfaces 
> to hash across the outgoing members of
> a given bond would not automatically work since a flexible hash algorithm 
> would not be easily configured the same
> across different input devices and also enforcing symmetry would be similarly 
> difficult.
> Potentially, we could also make these features mutually exclusive with using 
> the RSS hash as is done here.
> 
> This patch does offer some performance gain, so we could revisit as needed in 
> the above cases ?

For configurable hashing fields, I think there should be a different
OVS_HASH_ALG type.
Symmetric hashing is also not required for bonding to work correctly.
On the other hand, the kernel datapath uses exactly the same approach to
execute OVS_ACTION_ATTR_HASH: skb_get_hash() is used there, which equals
the RSS hash if it is available.
So, this patch just unifies the kernel and userspace ways of executing
hash actions. If some additional characteristics are required, we will
modify both datapaths accordingly.
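
(To summarize, the logic both datapaths now share boils down to the
following sketch. The wrapper name 'dp_hash_for_packet' is just
illustrative; the helpers are the OVS ones used in the diff above, from
lib/dp-packet.h, lib/flow.h and lib/hash.h.)

/* Condensed sketch of the hash selection in odp_execute_actions(). */
static uint32_t
dp_hash_for_packet(struct dp_packet *packet, uint32_t hash_basis)
{
    uint32_t hash;

    if (dp_packet_rss_valid(packet)) {
        /* Reuse the RSS hash computed by the NIC, folding in the basis
         * so different hash actions still produce different values. */
        hash = hash_int(dp_packet_get_rss_hash(packet), hash_basis);
    } else {
        /* No valid RSS hash: fall back to the 5-tuple hash. */
        struct flow flow;

        flow_extract(packet, &flow);
        hash = flow_hash_5tuple(&flow, hash_basis);
    }
    return hash;
}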

Best regards, Ilya Maximets.

>  packet->md.dp_hash = hash;
>  }
>  } else {
> diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
> index 8373f90..83c72cf 100644
> --- a/tests/ofproto-dpif.at
> +++ b/tests/ofproto-dpif.at
> @@ -491,10 +491,10 @@ for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
>  AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
>  done
>  
> -AT_CHECK([ovs-appctl dpctl/dump-flows | sed 
> 's/dp_hash(.*\/0x1)/dp_hash(0x\/0x1)/' | strip_ufid | strip_used | sort], 
> [0], [dnl
> +AT_CHECK([ovs-appctl dpctl/dump-flows | sed 
> 's/dp_hash(.*\/0x1)/dp_hash(0x\/0x1)/' | sed 's/\(actions:1\)[[01]]/\1X/' 
> | strip_ufid | strip_used | sort], [0], [dnl
>  flow-dump from non-dpdk interfaces:
>  
> recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no),
>  packets:15, bytes:630, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2)
> 
> -recirc_id(0x2),dp_hash(0x/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
>  packets:15, bytes:630, used:0.0s, actions:11
> 
> +recirc_id(0x2),dp_hash(0x/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
>  packets:15, bytes:630, used:0.0s, actions:1X
>  ])
> 

Re: [ovs-dev] [PATCH v2 2/3] dpif-netdev: Avoid port's reconfiguration on pmd-cpu-mask changes.

2017-07-10 Thread Ilya Maximets
On 07.07.2017 21:09, Darrell Ball wrote:
> 
> 
> On 7/6/17, 11:11 PM, "Ilya Maximets" <i.maxim...@samsung.com> wrote:
> 
> On 07.07.2017 08:08, Darrell Ball wrote:
> > 
>     > 
> > On 5/30/17, 7:12 AM, "Ilya Maximets" <i.maxim...@samsung.com> wrote:
> > 
> > Reconfiguration of HW NICs may lead to packet drops.
> > In the current model, all physical ports will be reconfigured each
> > time the number of PMD threads changes. Since we no longer stop
> > threads on pmd-cpu-mask changes, this patch will help to further
> > decrease ports' downtime by setting the maximum possible number
> > of wanted tx queues to avoid unnecessary reconfigurations.
> > 
> > Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
> > ---
> >  lib/dpif-netdev.c | 26 +-
> >  1 file changed, 21 insertions(+), 5 deletions(-)
> > 
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index 596d133..79db770 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -3453,7 +3453,7 @@ reconfigure_datapath(struct dp_netdev *dp)
> >  {
> >  struct dp_netdev_pmd_thread *pmd;
> >  struct dp_netdev_port *port;
> > -int wanted_txqs;
> > +int needed_txqs, wanted_txqs;
> >  
> >  dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
> >  
> > @@ -3461,7 +3461,15 @@ reconfigure_datapath(struct dp_netdev *dp)
> >   * on the system and the user configuration. */
> >  reconfigure_pmd_threads(dp);
> >  
> > -wanted_txqs = cmap_count(&dp->poll_threads);
> > +/* We need 1 Tx queue for each thread to avoid locking, but we will try
> > + * to allocate the maximum possible value to minimize the number of port
> > + * reconfigurations. */
> > +needed_txqs = cmap_count(&dp->poll_threads);
> > +/* (n_cores + 1) is the maximum that we might need to have.
> > + * Additional queue is for non-PMD threads. */
> > +wanted_txqs = ovs_numa_get_n_cores();
> > +ovs_assert(wanted_txqs != OVS_CORE_UNSPEC);
> > +wanted_txqs++;
> > 
> > I don't think PMD mask changes are common, so this patch is trying to
> > optimize a rare disruptive event that can/will be scheduled by the
> > administrator.
> > 
> > Based on the actual number of queues supported and the number of cores
> > present, this optimization may or may not work. It is unpredictable
> > whether there will be a benefit in a particular case from the user POV.
> > If I were the administrator, I would try to err on the conservative
> > side anyway if I could not predict the result.
> > 
> > Did I miss something?
> 
> In an NFV environment, if you want to add one more VM to your hosts,
> you will have to choose between:
> 
>   * not creating a new PMD thread
>   -> performance degradation of networking for other working VMs
> 
>   * adding a new PMD thread
>   -> disruption of networking for the whole host for the time
>  of OVS reconfiguration.
> 
> This patch removes the cost of the second option.
> 
> In general, adding a PMD thread per VM may not always (or even usually)
> make sense.

Not per VM. Let's assume that all existing threads are already overloaded.

> There are use cases for sure, but using a PMD core per VM is often viewed
> as dpdk using too much cpu resources and limiting the adoption of dpdk.
> 
> Furthermore, for dpdk gateways it is irrelevant.

I disagree with that statement.

> 
> I don't understand what you mean by 'unpredictable benefit'. The benefit
> is clear and this optimization will work, except for HW NICs with a very
> low number of HW queues.
> 
> HW NIC interfaces carry aggregated traffic for multiple VMs etc., so these
> cases are most important.

And that is exactly why this patch was implemented. It tries to
minimize packet drops on HW NICs "shared" between VMs. It allows
adding new threads without breaking networking for the others, because
the device that handles traffic for other VMs is not reconfigured at
that moment.
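
(To make the queue arithmetic concrete, here is a small standalone
sketch; the core/queue counts are hypothetical, only the
needed_txqs/wanted_txqs split follows the patch.)

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    int n_cores = 15;              /* assumed host core count */
    int n_pmds = 4;                /* current PMD threads */
    int needed_txqs = n_pmds + 1;  /* one Tx queue per thread + non-PMD, = 5 */
    int wanted_txqs = n_cores + 1; /* over-allocated up front, = 16 */

    /* Queues actually configured on two hypothetical NICs after
     * requesting wanted_txqs: min(wanted_txqs, HW maximum). */
    int nic_txqs[] = { 16, 8 };

    for (int i = 0; i < 2; i++) {
        /* XPS (dynamic_txqs) is only needed when the NIC cannot give one
         * queue per thread.  Because wanted_txqs is over-allocated,
         * growing the PMD set within n_cores never changes nic_txqs,
         * so no port reconfiguration is triggered. */
        bool dynamic_txqs = nic_txqs[i] < needed_txqs;
        printf("NIC%d: %d txqs -> dynamic_txqs=%s\n",
               i, nic_txqs[i], dynamic_txqs ? "true" : "false");
    }
    return 0;
}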

> It is the ratio of the number of cores to hw queues that matters. If queues

Re: [ovs-dev] [patch_v6 0/5] Userspace Datapath: Add ALG support.

2017-07-17 Thread Ilya Maximets
> ALG infra is added with support for FTP and TFTP.
> Both V4 and V6 are supported.  Also, NAT is supported.
> 
> Three passive ftp system tests are added to complete testing
> coverage of ftp for the userspace datapath, as the existing
> coverage of passive ftp was limited to one part of one test
> for V4 only.
> Another system test is added covering tftp with NAT which
> was not previously exercised.
> 
> v5->v6: Re-instated include inadvertently removed.
> Improve 2 of the new system tests in terms of
> potential races.
> 
> v4->v5: Address Ben's code review comments.
> First 3 patches were committed.
> 
> v3->v4: Fix tftp with NAT.
> Add a system test covering tftp with NAT.
> 
> v2->v3: Fix v4 passive ftp with NAT.
> Fix V6 passive ftp; parse check was broken.
> Add 3 tests covering v4/v6 passive ftp to
> complete ALG coverage in the system tests.
> 
> Code review caught a memory leak of the alg
> string such as "ftp" that could occur during
> nat tuple exhaustion. This is a pathological
> user error case whose fix was tested by
> instrumented simulation.
> Code review also pointed out that a connection
> context copy was unclear; this was moved to the
> caller where all allocation and error cleanup is
> done.
> Added several lock annotations that were missing 
> from the original conntrack code and nat code.
> Other review comments were fixed.
> 
> v1->v2:
> Mostly the addition of V6 FTP and TFTP support.
> 
> Removed define for unused FTP server port 20.
> 
> Add overflow checks for port numbers.
> 
> Instead of bypassing FTP bounce exploit with
> auto-correct, explicitly flag packet as invalid.
> 
> Seq number overflow and underflow checks added.
> 
> Darrell Ball (5):
>   Userspace Datapath: Add ALG infra and FTP.
>   Userspace Datapath: Add TFTP support.
>   System tests: Enable ALGs for userspace.
>   System tests: Add 4 new ftp and tftp tests.
>   NEWS: Announce userspace datapath ALG support.

Hi Darrell,

This is not a full review. I just wanted to ask you to rename the patches
and stop using 'Userspace Datapath' or 'System tests' as a module name for
changes localized to only one particular module.

I already raised this issue for the 'dpdk' prefix previously here:
https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/335139.html

So, about the current patch set:
The first two patches are localized to the lib/conntrack* files and should
have the 'conntrack' prefix as the module name. The 3rd and 4th patches
should have the 'system-userspace-macros' and 'system-traffic' module names
respectively. Such names will be more accurate and will conform to existing
commits and the contribution guide.

Best regards, Ilya Maximets.

> 
>  NEWS |1 +
>  include/sparse/netinet/in.h  |1 +
>  lib/conntrack-private.h  |   35 +-
>  lib/conntrack.c  | 1088 
> +++---
>  lib/conntrack.h  |   10 +-
>  tests/system-traffic.at  |  242 +
>  tests/system-userspace-macros.at |7 +-
>  7 files changed, 1306 insertions(+), 78 deletions(-)



Re: [ovs-dev] [patch_v1 1/2] System Tests: Allow SNAT address variability retries.

2017-07-17 Thread Ilya Maximets
Same thing here. Not a full review.
Please use 'system-traffic' as the area prefix for these patches.

Details are in previous e-mail:
https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/335751.html

Best regards, Ilya Maximets.


[ovs-dev] [PATCH 1/2] bond: Fix broken rebalancing after link state changes.

2017-07-20 Thread Ilya Maximets
There are 3 constraints for moving hashes from one slave to another:

1. The load difference is larger than ~3% of one slave's load.
2. The load difference between slaves exceeds 10 bytes.
3. Moving of the hash makes the load difference lower by > 10%.

In the current implementation, if one of the slaves goes DOWN, all
the hashes assigned to it will be moved to the other slaves. After that,
if the slave goes UP again, it will wait for rebalancing to get some
hashes back. But if there are more than 10 equally loaded hashes,
constraint #3 will never be satisfied, because each hash will handle
less than 10% of the load. The situation becomes worse as the number of
flows grows, and it's almost impossible to migrate any hash when all the
256 hash entries are used, which is very likely when there are a few
hundred/thousand flows.

As a result, if one of the slaves goes down and up while traffic
flows, it will never be used again for packet transmission.
The situation will not be fixed even if we stop traffic completely
and start it again, because the first two constraints will block
rebalancing at the earlier stages, while the amount of traffic is low.

Moving one hash if the destination has no hashes, as it was before
commit c460a6a7bc75 ("ofproto/bond: simplify rebalancing logic"),
will not help, because having one hash isn't enough to make the load
difference less than 10% of the total load, and that slave would
handle only that one hash forever.

To fix this, let's try to move a few hashes simultaneously to satisfy
constraint #3.

The implementation includes sorting of 'entries' to be able to collect
entries with accumulated load close enough to the ideal value.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---

I guess the following tag should be correct, but I'm not sure;
it was too long ago ...

Fixes: 5422a9e189c6 ("bonding: Balance bond slaves based on ratio.")

 ofproto/bond.c | 128 -
 1 file changed, 90 insertions(+), 38 deletions(-)
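
(A minimal sketch of the selection idea, hypothetical and much
simplified compared to choose_entries_to_migrate() in the diff below:
entries arrive sorted by descending tx_bytes and are greedily
accumulated until the moved load is close enough to the ideal delta.)

/* Hypothetical simplification; the real function below also handles
 * the short-list case and works on the slave's own entry list. */
static size_t
pick_entries(struct bond_entry **sorted, size_t n,
             uint64_t ideal_delta, struct bond_entry **to_migrate)
{
    uint64_t threshold = ideal_delta / 10;  /* 10% tolerance */
    uint64_t delta = 0;                     /* load collected so far */
    size_t cnt = 0;

    for (size_t i = 0; i < n && delta < ideal_delta; i++) {
        uint64_t load = sorted[i]->tx_bytes;

        /* Skip entries that would overshoot the ideal point by more than
         * the tolerance; a smaller entry later in the list may fit. */
        if (delta + load > ideal_delta + threshold) {
            continue;
        }
        delta += load;
        to_migrate[cnt++] = sorted[i];
    }
    return cnt;
}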

diff --git a/ofproto/bond.c b/ofproto/bond.c
index cb25a1d..75f7551 100644
--- a/ofproto/bond.c
+++ b/ofproto/bond.c
@@ -1073,49 +1073,72 @@ bond_shift_load(struct bond_entry *hash, struct 
bond_slave *to)
 bond->bond_revalidate = true;
 }
 
-/* Picks and returns a bond_entry to migrate from 'from' (the most heavily
+/* Picks and returns 'bond_entry's to migrate from 'from' (the most heavily
  * loaded bond slave) to a bond slave that has 'to_tx_bytes' bytes of load,
  * given that doing so must decrease the ratio of the load on the two slaves by
- * at least 0.1.  Returns NULL if there is no appropriate entry.
+ * at least 0.1.  Returns number of entries filled in 'to_migrate'.
  *
- * The list of entries isn't sorted.  I don't know of a reason to prefer to
- * shift away small hashes or large hashes. */
-static struct bond_entry *
-choose_entry_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes)
+ * The list of entries is sorted in descending order of load.  This allows us
+ * to collect subset of entries with accumulated load close to ideal.  */
+static size_t
+choose_entries_to_migrate(const struct bond_slave *from, uint64_t to_tx_bytes,
+  struct bond_entry **to_migrate)
 OVS_REQ_WRLOCK(rwlock)
 {
 struct bond_entry *e;
+/* Note, the ideal traffic is the mid point between 'from' and 'to'.
+ * This value does not change by rebalancing.  */
+uint64_t ideal_tx_bytes = (from->tx_bytes + to_tx_bytes) / 2;
+uint64_t ideal_delta = ideal_tx_bytes - to_tx_bytes;
+uint64_t delta = 0; /* The amount to rebalance. */
+uint64_t new_low;   /* The lower bandwidth between 'to' and 'from'
+ * after rebalancing. */
+uint64_t migrating_threshold = ideal_delta / 10; /* 10% */
+size_t cnt = 0;
 
 if (ovs_list_is_short(&from->entries)) {
 /* 'from' carries no more than one MAC hash, so shifting load away from
  * it would be pointless. */
-return NULL;
+return 0;
 }
 
 LIST_FOR_EACH (e, list_node, &from->entries) {
-uint64_t delta = e->tx_bytes;  /* The amount to rebalance.  */
-uint64_t ideal_tx_bytes = (from->tx_bytes + to_tx_bytes)/2;
- /* Note, the ideal traffic is the mid point
-  * between 'from' and 'to'. This value does
-  * not change by rebalancing.  */
-uint64_t new_low;/* The lower bandwidth between 'to' and 'from'
-after rebalancing. */
-
-new_low = MIN(from->tx_bytes - delta, to_tx_bytes + delta);
-
-if ((new_low > to_tx_bytes) &&
-(new_low - to_tx_bytes >= (ideal_tx_bytes - to_tx_bytes) / 10)) {
-/* Only rebalance if the new 'low' is closer to to the mid point,
- * and the improvement exceeds 10% o

[ovs-dev] [PATCH 2/2] ofproto-dpif.at: Add bonding down/up rebalancing test.

2017-07-20 Thread Ilya Maximets
Add a regression test which checks rebalancing of a tcp-balanced bond
after link state changes on one of the ports.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
To work correctly, this unit test requires the fix for the '--len' option
of the netdev-dummy/receive appctl command:

* https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/335930.html

 tests/ofproto-dpif.at | 96 +++
 1 file changed, 89 insertions(+), 7 deletions(-)

diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 9941e35..f38dc3b 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -96,6 +96,22 @@ AT_CHECK([test `egrep 'in_port\(6\)' br1_flows.txt |wc -l` -gt 3])
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
+# SEND_TCP_BOND_PKTS([p_name], [p_ofport], [packet_len])
+#
+# Sends 256 packets to port 'p_name' with different TCP destination ports.
+m4_define([SEND_TCP_BOND_PKTS],
+   [
+len_cmd=""
+if test -n "$3"; then
+len_cmd=" --len $3"
+fi
+for i in `seq 0 255`; do
+pkt="in_port($2),eth(src=50:54:00:00:00:05,dst=50:54:00:00:01:00),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=$i),tcp_flags(ack)"
+ovs-appctl netdev-dummy/receive $1 $pkt$len_cmd
+done
+   ]
+)
+
 AT_SETUP([ofproto-dpif - balance-tcp bonding])
 # Create br0 with interfaces bond0(p1, p2, p3) and p7,
 #and br1 with interfaces bond1(p4, p5, p6) and p8.
@@ -129,13 +145,7 @@ ovs-appctl time/stop
 ovs-appctl time/warp 100
 ovs-appctl lacp/show > lacp.txt
 ovs-appctl bond/show > bond.txt
-(
-for i in `seq 0 255` ;
-do
-pkt="in_port(7),eth(src=50:54:00:00:00:05,dst=50:54:00:00:01:00),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=$i),tcp_flags(ack)"
-AT_CHECK([ovs-appctl netdev-dummy/receive p7 $pkt])
-done
-)
+AT_CHECK([SEND_TCP_BOND_PKTS([p7], [7])])
 ovs-appctl time/warp 300 100
 AT_CHECK([ovs-appctl dpif/dump-flows br0 |grep tcp > br0_flows.txt])
 AT_CHECK([ovs-appctl dpif/dump-flows br1 |grep tcp > br1_flows.txt])
@@ -148,6 +158,78 @@ AT_CHECK([test `grep in_port.6 br1_flows.txt |wc -l` -gt 24])
 OVS_VSWITCHD_STOP()
 AT_CLEANUP
 
+# Make sure that rebalancing works after link state changes.
+AT_SETUP([ofproto-dpif - balance-tcp bonding rebalance after link state changes])
+# Create br0 with interfaces bond0(p1, p2) and p5,
+#and br1 with interfaces bond1(p3, p4) and p6.
+#bond0 <-> bond1
+# Send some traffic, set link state down and up for p2,
+# send big amount of traffic to trigger rebalancing and
+# make sure that some hashes rebalanced.
+OVS_VSWITCHD_START(
+  [add-bond br0 bond0 p1 p2 bond_mode=balance-tcp lacp=active \
+other-config:lacp-time=fast other-config:bond-rebalance-interval=1000 --\
+   set interface p1 type=dummy options:pstream=punix:$OVS_RUNDIR/p1.sock ofport_request=1 mtu_request=65535 -- \
+   set interface p2 type=dummy options:pstream=punix:$OVS_RUNDIR/p2.sock ofport_request=2 mtu_request=65535 -- \
+   add-port br0 p5 -- set interface p5 ofport_request=5 type=dummy mtu_request=65535 -- \
+   add-br br1 -- \
+   set bridge br1 other-config:hwaddr=aa:66:aa:66:00:00 -- \
+   set bridge br1 datapath-type=dummy other-config:datapath-id=1234 \
+  fail-mode=secure -- \
+   add-bond br1 bond1 p3 p4 bond_mode=balance-tcp lacp=active \
+other-config:lacp-time=fast other-config:bond-rebalance-interval=1000 --\
+   set interface p3 type=dummy options:stream=unix:$OVS_RUNDIR/p1.sock ofport_request=3 mtu_request=65535 -- \
+   set interface p4 type=dummy options:stream=unix:$OVS_RUNDIR/p2.sock ofport_request=4 mtu_request=65535 -- \
+   add-port br1 p6 -- set interface p6 ofport_request=6 type=dummy mtu_request=65535 --])
+AT_CHECK([ovs-appctl vlog/set bond:dbg])
+AT_CHECK([ovs-appctl netdev-dummy/set-admin-state up], 0, [OK
+])
+AT_CHECK([ovs-ofctl add-flow br0 action=normal])
+AT_CHECK([ovs-ofctl add-flow br1 action=normal])
+AT_CHECK([ovs-appctl upcall/disable-megaflows], [0], [megaflows disabled
+], [])
+OVS_WAIT_WHILE([ovs-appctl bond/show | grep "may_enable: false"])
+
+ovs-appctl time/stop
+ovs-appctl time/warp 2000 200
+
+# Send some traffic to distribute all the hashes between ports.
+AT_CHECK([SEND_TCP_BOND_PKTS([p5], [5], [65500])])
+
+# Wait for rebalancing for per-hash stats accounting.
+ovs-appctl time/warp 1000 100
+
+# Check that p2 handles some hashes.
+ovs-appctl bond/show > bond1.txt
+AT_CHECK([sed -n '/slave p2/,/^$/p' bond1.txt | grep 'hash'], [0], [ignore])
+
+# Move p2 down to force all hashes to move to p1
+AT_CHECK([ovs-appctl netdev-dummy/set-admin-state p2 down], 0, [OK
+])
+
+ovs-appctl time/warp 200 100
+# Check that all hashes moved from p2
+ovs-appctl bond/show > bond2.txt
+AT_CHECK([sed -n '/slave p2/,/^$/p' bond2.txt | grep 'hash'], [1], [ignore])
+
+# Move p2 up
+AT_CHECK([ov

[ovs-dev] [PATCH 0/2] bond: Fix broken rebalancing after link state changes.

2017-07-20 Thread Ilya Maximets
See the commit message in patch #1 for the bug description.
The issue can be triggered by the unit test from patch #2, but it
requires fix [1] for the '--len' option to be applied to work correctly.

The actual fix in patch #1 doesn't have that dependency.

[1] [PATCH 0/3] Fix '--len' option for netdev-dummy/receive.
https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/335930.html

Ilya Maximets (2):
  bond: Fix broken rebalancing after link state changes.
  ofproto-dpif.at: Add bonding down/up rebalancing test.

 ofproto/bond.c| 128 +++---
 tests/ofproto-dpif.at |  96 ++---
 2 files changed, 179 insertions(+), 45 deletions(-)

-- 
2.7.4



[ovs-dev] [PATCH v4 1/3] dpif-netdev: Incremental addition/deletion of PMD threads.

2017-07-21 Thread Ilya Maximets
Currently, change of 'pmd-cpu-mask' is very heavy operation.
It requires destroying of all the PMD threads and creating
them back. After that, all the threads will sleep until
ports' redistribution finished.

This patch adds ability to not stop the datapath while
adjusting number/placement of PMD threads. All not affected
threads will forward traffic without any additional latencies.

id-pool created for static tx queue ids to keep them sequential
in a flexible way. non-PMD thread will always have
static_tx_qid = 0 as it was before.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
Tested-by: Mark Kavanagh <mark.b.kavan...@intel.com>
Acked-by: Mark Kavanagh <mark.b.kavan...@intel.com>
---
 lib/dpif-netdev.c | 146 +++---
 tests/pmd.at  |   2 +-
 2 files changed, 108 insertions(+), 40 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 47a9fa0..4de3678 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -48,6 +48,7 @@
 #include "fat-rwlock.h"
 #include "flow.h"
 #include "hmapx.h"
+#include "id-pool.h"
 #include "latch.h"
 #include "netdev.h"
 #include "netdev-vport.h"
@@ -281,6 +282,9 @@ struct dp_netdev {
 
 /* Stores all 'struct dp_netdev_pmd_thread's. */
 struct cmap poll_threads;
+/* id pool for per thread static_tx_qid. */
+struct id_pool *tx_qid_pool;
+struct ovs_mutex tx_qid_pool_mutex;
 
 /* Protects the access of the 'struct dp_netdev_pmd_thread'
  * instance for non-pmd thread. */
@@ -567,7 +571,7 @@ struct dp_netdev_pmd_thread {
 /* Queue id used by this pmd thread to send packets on all netdevs if
  * XPS disabled for this netdev. All static_tx_qid's are unique and less
  * than 'cmap_count(dp->poll_threads)'. */
-const int static_tx_qid;
+uint32_t static_tx_qid;
 
 struct ovs_mutex port_mutex;/* Mutex for 'poll_list' and 'tx_ports'. */
 /* List of rx queues to poll. */
@@ -647,6 +651,8 @@ static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
   unsigned core_id);
 static struct dp_netdev_pmd_thread *
 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
+static void dp_netdev_del_pmd(struct dp_netdev *dp,
+  struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
@@ -1185,10 +1191,17 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
 
 cmap_init(&dp->poll_threads);
+
+ovs_mutex_init(&dp->tx_qid_pool_mutex);
+/* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
+dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
+
 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
 ovsthread_key_create(&dp->per_pmd_key, NULL);
 
 ovs_mutex_lock(&dp->port_mutex);
+/* non-PMD will be created before all other threads and will
+ * allocate static_tx_qid = 0. */
 dp_netdev_set_nonpmd(dp);
 
 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
@@ -1283,6 +1296,9 @@ dp_netdev_free(struct dp_netdev *dp)
 dp_netdev_destroy_all_pmds(dp, true);
 cmap_destroy(&dp->poll_threads);
 
+ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
+id_pool_destroy(dp->tx_qid_pool);
+
 ovs_mutex_destroy(&dp->non_pmd_mutex);
 ovsthread_key_delete(dp->per_pmd_key);
 
@@ -3303,12 +3319,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
 }
 
 static void
+reload_affected_pmds(struct dp_netdev *dp)
+{
+struct dp_netdev_pmd_thread *pmd;
+
+CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+if (pmd->need_reload) {
+dp_netdev_reload_pmd__(pmd);
+pmd->need_reload = false;
+}
+}
+}
+
+static void
 reconfigure_pmd_threads(struct dp_netdev *dp)
 OVS_REQUIRES(dp->port_mutex)
 {
 struct dp_netdev_pmd_thread *pmd;
 struct ovs_numa_dump *pmd_cores;
+struct ovs_numa_info_core *core;
+struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
+struct hmapx_node *node;
 bool changed = false;
+bool need_to_adjust_static_tx_qids = false;
 
 /* The pmd threads should be started only if there's a pmd port in the
  * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
@@ -3321,40 +3354,64 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
 }
 
-/* Check for changed configuration */
-if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
-changed = true;
-

[ovs-dev] [PATCH v4 3/3] dpif-netdev: Don't uninit emc on reload.

2017-07-21 Thread Ilya Maximets
There are many reasons for reloading PMD threads:
* Reconfiguration of one of the ports.
* Adjusting of static_tx_qid.
* Adding new tx/rx ports.

In many cases the EMC is still useful after reload, and uninit
will only lead to unnecessary upcalls/classifier lookups.

Such behaviour slows down the datapath. Uninit itself slows
down the reload path. All these factors lead to additional
unexpected latencies/drops on events not directly connected
to the current PMD thread.

Let's not uninitialize the EMC cache on the reload path.
'emc_cache_slow_sweep()' and replacements should free all
the old/unwanted entries.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Cian Ferriter <cian.ferri...@intel.com>
Tested-by: Cian Ferriter <cian.ferri...@intel.com>
---
 lib/dpif-netdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index a1e8c56..74d3535 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3810,9 +3810,9 @@ pmd_thread_main(void *f_)
 ovs_numa_thread_setaffinity_core(pmd->core_id);
 dpdk_set_lcore_id(pmd->core_id);
 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
+emc_cache_init(&pmd->flow_cache);
 reload:
 pmd_alloc_static_tx_qid(pmd);
-emc_cache_init(>flow_cache);
 
 /* List port/core affinity */
 for (i = 0; i < poll_cnt; i++) {
@@ -3866,13 +3866,13 @@ reload:
  * reloading the updated configuration. */
 dp_netdev_pmd_reload_done(pmd);
 
-emc_cache_uninit(&pmd->flow_cache);
 pmd_free_static_tx_qid(pmd);
 
 if (!exiting) {
 goto reload;
 }
 
+emc_cache_uninit(&pmd->flow_cache);
 free(poll_list);
 pmd_free_cached_ports(pmd);
 return NULL;
-- 
2.7.4



[ovs-dev] [PATCH v4 2/3] dpif-netdev: Avoid port's reconfiguration on pmd-cpu-mask changes.

2017-07-21 Thread Ilya Maximets
Reconfiguration of HW NICs may lead to packet drops.
In the current model, all physical ports will be reconfigured each
time the number of PMD threads changes. Since we no longer stop
threads on pmd-cpu-mask changes, this patch helps to further
decrease ports' downtime by setting the maximum possible number
of wanted tx queues to avoid unnecessary reconfigurations.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
Tested-by: Ian Stokes <ian.sto...@intel.com>
Acked-by: Ian Stokes <ian.sto...@intel.com>
---
 lib/dpif-netdev.c | 26 +-
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4de3678..a1e8c56 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3457,7 +3457,7 @@ reconfigure_datapath(struct dp_netdev *dp)
 {
 struct dp_netdev_pmd_thread *pmd;
 struct dp_netdev_port *port;
-int wanted_txqs;
+int needed_txqs, wanted_txqs;
 
 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
 
@@ -3465,7 +3465,15 @@ reconfigure_datapath(struct dp_netdev *dp)
  * on the system and the user configuration. */
 reconfigure_pmd_threads(dp);
 
-wanted_txqs = cmap_count(&dp->poll_threads);
+/* We need 1 Tx queue for each thread to avoid locking, but we will try
+ * to allocate the maximum possible value to minimize the number of port
+ * reconfigurations. */
+needed_txqs = cmap_count(&dp->poll_threads);
+/* (n_cores + 1) is the maximum that we might need to have.
+ * Additional queue is for non-PMD threads. */
+wanted_txqs = ovs_numa_get_n_cores();
+ovs_assert(wanted_txqs != OVS_CORE_UNSPEC);
+wanted_txqs++;
 
 /* The number of pmd threads might have changed, or a port can be new:
  * adjust the txqs. */
@@ -3478,9 +3486,17 @@ reconfigure_datapath(struct dp_netdev *dp)
 
 /* Check for all the ports that need reconfiguration.  We cache this in
  * 'port->need_reconfigure', because netdev_is_reconf_required() can
- * change at any time. */
+ * change at any time.
+ * Also mark for reconfiguration all ports which will likely change their
+ * 'dynamic_txqs' parameter. It's required to stop using them before
+ * changing this setting and it's simpler to mark ports here and allow
+ * 'pmd_remove_stale_ports' to remove them from threads. There will be
+ * no actual reconfiguration in 'port_reconfigure' because it's
+ * unnecessary.  */
 HMAP_FOR_EACH (port, node, &dp->ports) {
-if (netdev_is_reconf_required(port->netdev)) {
+if (netdev_is_reconf_required(port->netdev)
+|| (port->dynamic_txqs
+!= (netdev_n_txq(port->netdev) < needed_txqs))) {
 port->need_reconfigure = true;
 }
 }
@@ -3515,7 +3531,7 @@ reconfigure_datapath(struct dp_netdev *dp)
 seq_change(dp->port_seq);
 port_destroy(port);
 } else {
-port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
+port->dynamic_txqs = netdev_n_txq(port->netdev) < needed_txqs;
 }
 }
 
-- 
2.7.4



Re: [ovs-dev] [patch_v1 1/2] System Tests: Allow SNAT address variability retries.

2017-07-21 Thread Ilya Maximets
On 21.07.2017 05:40, Darrell Ball wrote:
> The discussion about the ‘Area’ prefix has come up again, even after Ben had 
> commented about it
> and after I had pointed folks to the submitting-patches.rst, which allows 
> flexibility in choosing an
> ‘Area’ prefix by the patch submitter.
> 
> In this thread, it was again suggested that the use of ‘Area’ prefix ‘System 
> Tests’ needs to change to ‘System-traffic’
> Below, I list some the history regarding previous commits of 
> system-traffic.at as a single patch.
> Different people have had different preferences and those seem to have been 
> tolerated in the past.
> 
> Hence, I would like to know what has changed recently such that the documented
> (submitting-patches.rst) and historical flexibility (see previous patches) 
> regarding the
> ‘Area’ prefix is no longer tolerated ?
> 
> My suggestion is that we don’t continue along these new lines and rather stay 
> flexible, as the 
> work will be more productive in such an environment.
> 
> Darrell

Hi Darrell.

Looks like I should say something because I raised this issue.
First of all I want to say that it's my own opinion and you're
completely free to disagree with it, but I need to clarify my
position.

About system-traffic.at related patches:

You mentioned below commits which have 'tests' and 'system-tests'
prefixes. 'tests' is fine, because it is the folder name. Maybe
authors should be more specific with patches where only one file
changed, but it doesn't really matter.
On the other hand, 'system-tests' is not the name of a file or
folder, it's the area. But if you go back through the history,
all of these patches were committed when there was only one file
responsible for system tests (one patch is an exception, introduced
after system-ovn.at appeared). So it was, actually, fine
at the time of submission.

Today we have at least 4 types of system tests, and it would be nice
to have more detailed information directly in the subject instead of
having to look into the patch itself.

I personally don't like capital letters in the area prefix, except for
cases where the capital letter is part of the filename.

About datapaths:

Previously you mentioned that you're using the area 'Userspace Datapath'
to be consistent with policies used for the kernel and windows datapaths.
But this statement is not right, because 'datapath' is the name
of a folder and 'datapath-windows' is the name of a folder too, but
there is no such folder for the userspace datapath.

Additionally, this mailing list is actually not the primary place
for reviewing patches for the kernel datapath. They are here only for
information and backporting. The 'datapath-windows' prefix is needed to
filter patches targeted at windows, because there are only a few
people who work on that and are able to review and test.
So, the main areas for patches in this mailing list are general
management code, userspace actions and the userspace datapath.
The userspace datapath contains too many files/modules not to
mention them in the subject line. So, if you're submitting a patch
with the 'conntrack' prefix, everybody knows that it's all about
connection tracking in userspace.

Besides all of that: isn't it a good habit to use the most commonly
used prefixes, like 'system-traffic' or 'conntrack', instead of
making up new ones?
If everybody uses their own preferred prefixes, the git history
will become a total mess. And that is the main concern.

Once again, It's only my opinion and you're free to disagree.

Best regards, Ilya Maximets.

> /
> 
> commit 9d3e0e5c196c0a91ea23d8d9254b1487cb58b58e
> Author: Jarno Rajahalme <ja...@ovn.org>
> Date:   Wed Mar 8 17:18:23 2017 -0800
> 
> tests: Add an FTP test without conntrack.
> 
> If FTP tests with conntrack fail, it is informative to know if the
> problem is with the FTP client and/or server, or with conntrack
> itself.
> 
> Signed-off-by: Jarno Rajahalme <ja...@ovn.org>
> Acked-by: Joe Stringer <j...@ovn.org>
> 
> 
> /
> 
> commit d0e4206230b31ab8dde44b6e8896c10b6317b1a8
> Author: Jarno Rajahalme <ja...@ovn.org>
> Date:   Fri Mar 10 16:10:41 2017 -0800
> 
> tests: ICMP related to original direction test.
> 
> Normally ICMP responses are in the reply direction of a conntrack
> entry.  This test exercises an ICMP response to the original direction
> of the conntrack entry.
> 
> Signed-off-by: Jarno Rajahalme <ja...@ovn.org>
> Acked-by: Joe Stringer j...@ovn.org
> 
> /
> 
> commit 2fa3e06d35988ee24ce1cc0f62ccceb3862038a1
> Author: Jarno Rajahalme <ja...@ovn.org>
> Date:   Wed Nov 25 16:04:59 2015 -0800
> 
> system-tests: Add IPv6 FTP system test.
> 
> Signed-off-by: Jarno Rajahalme <ja...@ovn.org>
> Acked-by: Joe Stringer <j...@ovn.o

[ovs-dev] [PATCH] bond: Unify hash functions in hash action and entry lookup.

2017-07-21 Thread Ilya Maximets
'lookup_bond_entry' currently uses 'flow_hash_symmetric_l4' while
OVS_ACTION_ATTR_HASH uses 'flow_hash_5tuple'. This may lead to
inconsistency in slave selection for new flows.  Strictly speaking,
there is no need to unify the hash functions, because it's not
required for correct operation, but it's logically wrong to use
different hash functions there.

Unfortunately, we're not able to use the RSS hash here, because we
have no packet at this point, but we may reduce the inconsistency by
using 'flow_hash_5tuple' instead of 'flow_hash_symmetric_l4', because
the symmetric quality is not needed.

'flow_hash_symmetric_l4' was used previously just because there
was no other implemented hash function at the time. Now we
have the 5-tuple hash and may replace the old function.

'flow_hash_5tuple' is the preferable solution because it is 2 - 8 times
faster (depending on the flow) than the symmetric function.
So, this change will also speed up handling of new flows and
statistics accounting.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 ofproto/bond.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)
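
(Since the 2 - 8x figure is flow-dependent, one way to sanity-check it
is a micro-benchmark along these lines -- a hypothetical harness, not
part of the patch; it assumes OVS's lib/flow.h on the include path and
needs realistic flow contents for meaningful numbers.)

#include <stdio.h>
#include <string.h>
#include <time.h>
#include "flow.h"

static double
bench(uint32_t (*fn)(const struct flow *, uint32_t), const struct flow *f)
{
    struct timespec t0, t1;
    volatile uint32_t sink = 0;   /* keep the calls from being elided */

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < 1000000; i++) {
        sink ^= fn(f, i);
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);
    return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int main(void)
{
    struct flow f;

    memset(&f, 0, sizeof f);      /* fill with a real flow in practice */
    printf("flow_hash_5tuple:       %.3fs\n", bench(flow_hash_5tuple, &f));
    printf("flow_hash_symmetric_l4: %.3fs\n",
           bench(flow_hash_symmetric_l4, &f));
    return 0;
}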

diff --git a/ofproto/bond.c b/ofproto/bond.c
index cb25a1d..72b373c 100644
--- a/ofproto/bond.c
+++ b/ofproto/bond.c
@@ -1746,12 +1746,10 @@ static unsigned int
 bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
 {
 struct flow hash_flow = *flow;
+
 hash_flow.vlans[0].tci = htons(vlan);
 
-/* The symmetric quality of this hash function is not required, but
- * flow_hash_symmetric_l4 already exists, and is sufficient for our
- * purposes, so we use it out of convenience. */
-return flow_hash_symmetric_l4(&hash_flow, basis);
+return flow_hash_5tuple(&hash_flow, basis);
 }
 
 static unsigned int
-- 
2.7.4



Re: [ovs-dev] Fwd: [PATCH] bond: Unify hash functions in hash action and entry lookup.

2017-07-25 Thread Ilya Maximets
On 24.07.2017 22:53, Andy Zhou wrote:
> On Mon, Jul 24, 2017 at 9:23 AM, Ilya Maximets <i.maxim...@samsung.com> wrote:
>> On 23.07.2017 00:02, Darrell Ball wrote:
>>>
>>>
>>> -Original Message-
>>> From: <ovs-dev-boun...@openvswitch.org> on behalf of Andy Zhou 
>>> <az...@ovn.org>
>>> Date: Friday, July 21, 2017 at 2:17 PM
>>> To: "<d...@openvswitch.org>" <d...@openvswitch.org>
>>> Subject: [ovs-dev] Fwd: [PATCH] bond: Unify hash functions in hash action   
>>>   and entry lookup.
>>>
>>> Add dev mailing list. It got dropped by accident.
>>>
>>>
>>> -- Forwarded message --
>>>     From: Andy Zhou <az...@ovn.org>
>>> Date: Fri, Jul 21, 2017 at 2:14 PM
>>> Subject: Re: [PATCH] bond: Unify hash functions in hash action and 
>>> entry lookup.
>>> To: Ilya Maximets <i.maxim...@samsung.com>
>>>
>>>
>>> As it turns out, we can go even further:
>>>
>>> Notice that lookup_bond_entry() is only called with the code path of 
>>> BM_SLB.
>>> and bond_hash() is only called by lookup_bond_entry().
>>>
>>> I think we can just absorb the logic of lookup_bond_entry() into
>>> choose_output_slave()
>>> and remove bond_hash() all together.  What do you think?
>>>
>>>
>>> On Fri, Jul 21, 2017 at 1:06 PM, Andy Zhou <az...@ovn.org> wrote:
>>> > On Fri, Jul 21, 2017 at 6:28 AM, Ilya Maximets 
>>> <i.maxim...@samsung.com> wrote:
>>> >> 'lookup_bond_entry' currently uses 'flow_hash_symmetric_l4' while
>>> >> OVS_ACTION_ATTR_HASH uses 'flow_hash_5tuple'. This may lead to
>>> >> inconsistency in slave choosing for the new flows.  In general,
>>> >> there is no point to unify hash functions, because it's not
>>> >> required for correct work, but it's logically wrong to use
>>> >> different hash functions there.
>>> >>
>>> >> Unfortunately we're not able to use RSS hash here, because we have
>>> >> no packet at this point, but we may reduce inconsistency by using
>>> >> 'flow_hash_5tuple' instead of 'flow_hash_symmetric_l4' because
>>> >> symmetric quality is not needed.
>>> >>
>>> >> 'flow_hash_symmetric_l4' was used previously just because there
>>> >> was no other implemented hash function at the moment. Now we
>>> >> have 5tuple hash and may replace the old function.
>>>
>>> [Darrell]
>>>
>>> What other load balance option is available to do load balancing of L2 
>>> packets (non-IP)
>>> ‘at the same time’ as IPv4/6 packets for bonds ?
>>> Unless there is another, I am not sure giving up the load balancing of L2 
>>> packets is desirable.
>>> There would be a loss of feature functionality with this patch.
>>>
>>> A bond at a gateway (one of the most common use cases) could handle many CFM
>>> sessions, for example and dropping L2 fields from the hash sends all L2 
>>> packets to a
>>> single interface of a bond (single point of failure).
>>> The algorithm flow_hash_symmetric_l4 includes L2 fields (macs and vlans)
>>> in addition to IPv4/6 and L4 fields, which means it can load balance L2 
>>> packets (eg CFM)
>>> in addition to IPv4/6 packets.
>>>
>>> We have documented that L2 load balancing is included in balance-tcp, which 
>>> at the very
>>> least would need to change, assuming we thought such a change had more 
>>> advantages than disadvantages.
>>>
>>> http://openvswitch.org/support/dist-docs/ovs-vswitchd.conf.db.5.pdf
>>>
>>> “The following modes require the upstream switch to support 802.3ad with 
>>> successful LACP negotiation. If
>>> LACP negotiation fails and other-config:lacp-fallback-ab is true, then 
>>> active−backup mode is used:
>>>
>>>balance−tcp
>>> Balances flows among slaves based on L2, L3, and L4 
>>> protocol information such as destination
>>> MAC address, IP address, and TCP port.”
>>>
>>> What is the overall time cost savings in the scope of the whole code 
>>> pipeline for flow creation, not
>>> just the hash function itself (as mentioned in the commit message

[ovs-dev] [PATCH v2 2/3] bond: Unify hash functions in hash action and entry lookup.

2017-07-25 Thread Ilya Maximets
'lookup_bond_entry' currently uses 'flow_hash_symmetric_l4' while
OVS_ACTION_ATTR_HASH uses 'flow_hash_5tuple'. This may lead to
inconsistency in slave selection for new flows.  Strictly speaking,
there is no need to unify the hash functions, because it's not
required for correct operation, but it's logically wrong to use
different hash functions there.

Unfortunately, we're not able to use the RSS hash here, because we
have no packet at this point, but we may reduce the inconsistency by
using 'flow_hash_5tuple' instead of 'flow_hash_symmetric_l4', because
the symmetric quality is not needed.

'flow_hash_symmetric_l4' was used previously just because there
was no other implemented hash function at the time and L2
fields were additionally involved in the hash calculation. Now we
have the 5-tuple hash and L2 is not used anymore, so we may replace
the old function.

'flow_hash_5tuple' is the preferable solution because it is 2 - 8 times
faster (depending on the flow) than the symmetric function.
So, this change will also speed up handling of new flows and
statistics accounting.

Additionally, the function 'bond_hash_tcp()' was removed for reasons
of code simplification and a possible additional speed up.

Co-authored-by: Andy Zhou <az...@ovn.org>
Signed-off-by: Andy Zhou <az...@ovn.org>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 ofproto/bond.c | 16 +---
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/ofproto/bond.c b/ofproto/bond.c
index cb25a1d..e4d4b65 100644
--- a/ofproto/bond.c
+++ b/ofproto/bond.c
@@ -177,8 +177,6 @@ static void bond_choose_active_slave(struct bond *)
 OVS_REQ_WRLOCK(rwlock);
 static unsigned int bond_hash_src(const struct eth_addr mac,
   uint16_t vlan, uint32_t basis);
-static unsigned int bond_hash_tcp(const struct flow *, uint16_t vlan,
-  uint32_t basis);
 static struct bond_entry *lookup_bond_entry(const struct bond *,
 const struct flow *,
 uint16_t vlan)
@@ -1743,24 +1741,12 @@ bond_hash_src(const struct eth_addr mac, uint16_t vlan, 
uint32_t basis)
 }
 
 static unsigned int
-bond_hash_tcp(const struct flow *flow, uint16_t vlan, uint32_t basis)
-{
-struct flow hash_flow = *flow;
-hash_flow.vlans[0].tci = htons(vlan);
-
-/* The symmetric quality of this hash function is not required, but
- * flow_hash_symmetric_l4 already exists, and is sufficient for our
- * purposes, so we use it out of convenience. */
-return flow_hash_symmetric_l4(&hash_flow, basis);
-}
-
-static unsigned int
 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
 {
 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
 
 return (bond->balance == BM_TCP
-? bond_hash_tcp(flow, vlan, bond->basis)
+? flow_hash_5tuple(flow, bond->basis)
 : bond_hash_src(flow->dl_src, vlan, bond->basis));
 }
 
-- 
2.7.4



[ovs-dev] [PATCH v2 3/3] bond: Remove bond_hash_src.

2017-07-25 Thread Ilya Maximets
Since the introduction of the 'hash_mac()' function in
commit 7e36ac42e33a ("lib/packet.h: add hash_mac()"), there is no
need to have an additional wrapper for MAC address hashing.

Let's use 'hash_mac()' directly and remove 'bond_hash_src()' to
simplify the code.

Suggested-by: Andy Zhou <az...@ovn.org>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 ofproto/bond.c | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/ofproto/bond.c b/ofproto/bond.c
index e4d4b65..e09136e 100644
--- a/ofproto/bond.c
+++ b/ofproto/bond.c
@@ -175,8 +175,6 @@ static void bond_link_status_update(struct bond_slave *)
 OVS_REQ_WRLOCK(rwlock);
 static void bond_choose_active_slave(struct bond *)
 OVS_REQ_WRLOCK(rwlock);
-static unsigned int bond_hash_src(const struct eth_addr mac,
-  uint16_t vlan, uint32_t basis);
 static struct bond_entry *lookup_bond_entry(const struct bond *,
 const struct flow *,
 uint16_t vlan)
@@ -1615,7 +1613,7 @@ bond_unixctl_hash(struct unixctl_conn *conn, int argc, 
const char *argv[],
 }
 
 if (ovs_scan(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
-hash = bond_hash_src(mac, vlan, basis) & BOND_MASK;
+hash = hash_mac(mac, vlan, basis) & BOND_MASK;
 
 hash_cstr = xasprintf("%u", hash);
 unixctl_command_reply(conn, hash_cstr);
@@ -1735,19 +1733,13 @@ bond_link_status_update(struct bond_slave *slave)
 }
 
 static unsigned int
-bond_hash_src(const struct eth_addr mac, uint16_t vlan, uint32_t basis)
-{
-return hash_mac(mac, vlan, basis);
-}
-
-static unsigned int
 bond_hash(const struct bond *bond, const struct flow *flow, uint16_t vlan)
 {
 ovs_assert(bond->balance == BM_TCP || bond->balance == BM_SLB);
 
 return (bond->balance == BM_TCP
 ? flow_hash_5tuple(flow, bond->basis)
-: bond_hash_src(flow->dl_src, vlan, bond->basis));
+: hash_mac(flow->dl_src, vlan, bond->basis));
 }
 
 static struct bond_entry *
-- 
2.7.4



[ovs-dev] [PATCH v2 1/3] vswitch.xml: Fix L2 balancing mentioning for balance-tcp bond.

2017-07-25 Thread Ilya Maximets
L2 fields are not used in userspace hash action since
commit 4f150744921f ("dpif-netdev: Use miniflow as a flow key.").
In kernel datapath RSS (which is not include L2 by default for
most of the NICs) was used from the beginning. This means that
if recirculation is in use, L2 fields are not used for flow
balancing.

Fix the documentation accordingly.

Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---

I think this should be applied to some stable branches too.

 vswitchd/vswitch.xml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 883ecd8..074535b 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1569,9 +1569,8 @@
   
 balance-tcp
 
-  Balances flows among slaves based on L2, L3, and L4 protocol
-  information such as destination MAC address, IP address, and TCP
-  port.
+  Balances flows among slaves based on L3 and L4 protocol information
+  such as IP addresses and TCP/UDP ports.
 
   
 
-- 
2.7.4



[ovs-dev] [PATCH v2 0/3] bond: Clean up hash functions.

2017-07-25 Thread Ilya Maximets
Version 2:
* Became a patch-set.
  v1: 
https://mail.openvswitch.org/pipermail/ovs-dev/2017-July/336078.html
* Removed bond_hash_tcp.
* 2 new patches added:
* Documentation fix.
* Remove bond_hash_src.

Ilya Maximets (3):
  vswitch.xml: Fix L2 balancing mentioning for balance-tcp bond.
  bond: Unify hash functions in hash action and entry lookup.
  bond: Remove bond_hash_src.

 ofproto/bond.c   | 28 +++-
 vswitchd/vswitch.xml |  5 ++---
 2 files changed, 5 insertions(+), 28 deletions(-)

-- 
2.7.4



Re: [ovs-dev] [PATCH 1/3] flow: Add packet_size option to flow_compose.

2017-07-25 Thread Ilya Maximets
On 24.07.2017 22:40, Andy Zhou wrote:
> On Mon, Jul 24, 2017 at 6:33 AM, Ilya Maximets <i.maxim...@samsung.com> wrote:
>> On 22.07.2017 01:38, Andy Zhou wrote:
>>> On Wed, Jul 19, 2017 at 7:51 AM, Ilya Maximets <i.maxim...@samsung.com> wrote:
>>>> This allows composing packets with different real lengths from
>>>> odp flows, i.e. memory will be allocated for the requested packet
>>>> size and all required headers like ip->tot_len filled in correctly.
>>>>
>>>> Will be used in netdev-dummy to properly handle the '--len' option.
>>>>
>>>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
>>>
>>> Thank you for working on this.  Although those functions are mainly
>>> for testing, it is still good that we improve them.
>>>
>>> I am wondering about a slightly different approach. Instead of adding
>>> 'packet_size' to the flow_compose() interface, would it make
>>> sense to come up with a new function whose task is to
>>> expand a packet to the final length (similar to flow_compose_l4_csum())?
>>>
>>> We would first create the necessary headers for all layers based on
>>> the flow, without filling in the actual size-related fields or
>>> computing checksums.
>>>
>>> Then the fix-size function would take over, fill in the data, and
>>> update the various headers.
>>>
>>> Then checksums can be computed and filled in.
>>>
>>> I think the logic will be easier to follow with this approach. What
>>> do you think?
>>
>>
>> I thought about this. I just tried to avoid double packet parsing,
>> but such an approach could be interesting.
> 
> This approach looks fine to me.
>>
>> Below is a possible implementation.
>> If you think it's better than the modification of flow_compose(),
>> I can send v2 with the implementation below:
> 
> I don't think that eth_from_flow() adds much value. Would it
> be less clear if we just used the flow_compose_xxx() APIs?

The string-to-flow parsing code complicates the receive function.
I don't see a cleaner solution right now.
I'll send v2 with the netdev-dummy changes as below.
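
In outline, the staged approach discussed above could look like the rough
sketch below; packet_expand_to() is a hypothetical name, flow_compose() is
the existing composer, and flow_compose_l4_csum() is the static helper
quoted in the diff that follows:

static void
compose_with_size(struct dp_packet *p, const struct flow *flow, size_t size)
{
    /* 1. Build headers from the flow; length fields and checksums
     *    are not yet meaningful. */
    flow_compose(p, flow);

    /* 2. Hypothetical resizer: pad the payload up to 'size' and patch
     *    length fields such as ip->tot_len and udp->udp_len. */
    packet_expand_to(p, flow, size);

    /* 3. Compute L4 checksums last, over the final bytes, as
     *    flow_compose_l4_csum() does in the hunk below. */
}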


> Please go ahead with V2. Looking forward to it.
> 
>>
>> --8<--->8--
>> diff --git a/lib/flow.c b/lib/flow.c
>> index e1597fa..ce99c06 100644
>> --- a/lib/flow.c
>> +++ b/lib/flow.c
>> @@ -2706,40 +2706,87 @@ flow_compose_l4_csum(struct dp_packet *p, const struct flow *flow,
>>  if (flow->nw_proto == IPPROTO_TCP) {
>>  struct tcp_header *tcp = dp_packet_l4(p);
>>
>> -/* Checksum has already been zeroed by put_zeros call in
>> - * flow_compose_l4(). */
>> +tcp->tcp_csum = 0;
>>  tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
>>tcp, l4_len));
>>  } else if (flow->nw_proto == IPPROTO_UDP) {
>>  struct udp_header *udp = dp_packet_l4(p);
>>
>> -/* Checksum has already been zeroed by put_zeros call in
>> - * flow_compose_l4(). */
>> +udp->udp_csum = 0;
>>  udp->udp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
>>udp, l4_len));
>>  } else if (flow->nw_proto == IPPROTO_ICMP) {
>>  struct icmp_header *icmp = dp_packet_l4(p);
>>
>> -/* Checksum has already been zeroed by put_zeros call in
>> - * flow_compose_l4(). */
>> +icmp->icmp_csum = 0;
>>  icmp->icmp_csum = csum(icmp, l4_len);
>>  } else if (flow->nw_proto == IPPROTO_IGMP) {
>>  struct igmp_header *igmp = dp_packet_l4(p);
>>
>> -/* Checksum has already been zeroed by put_zeros call in
>> - * flow_compose_l4(). */
>> +igmp->igmp_csum = 0;
>>  igmp->igmp_csum = csum(igmp, l4_len);
>>  } else if (flow->nw_proto == IPPROTO_ICMPV6) {
>>  struct icmp6_hdr *icmp = dp_packet_l4(p);
>>
>> -/* Checksum has already been zeroed by put_zeros call in
>> - * flow_compose_l4(). */
>> +icmp->icmp6_cksum = 0;
>>  icmp->icmp6_cksum = (OVS_FORCE uint16_t)
>>  csum_finish(csum_continue(pseudo_hdr_csum, icmp, l4_len));
>>  }
>>  }
>>  }
>>
>> +/* Tri
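
For context on the pattern above: the checksum field is itself covered by
the ones'-complement sum, so each branch zeroes it before summing, while
pseudo_hdr_csum carries the partial sum over the IP pseudo-header. A minimal
sketch for the UDP case, assuming csum_continue() and csum_finish() from
lib/csum.h (fill_udp_csum() is an illustrative name):

static void
fill_udp_csum(struct udp_header *udp, uint32_t pseudo_hdr_csum, size_t l4_len)
{
    udp->udp_csum = 0;    /* The field is part of the summed bytes. */
    udp->udp_csum = csum_finish(csum_continue(pseudo_hdr_csum, udp, l4_len));
    if (!udp->udp_csum) {
        udp->udp_csum = htons(0xffff);  /* Zero means "no checksum" for UDP. */
    }
}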
