[ovs-dev] [PATCH v9 02/15] dpif-netdev: Remove unused 'index' in dp_netdev_pmd_thread.

2016-04-22 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 24717cc..060f5e0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -430,8 +430,6 @@ struct dp_netdev_pmd_thread {
 struct latch exit_latch;/* For terminating the pmd thread. */
 atomic_uint change_seq; /* For reloading pmd ports. */
 pthread_t thread;
-int index;  /* Idx of this pmd thread among pmd*/
-/* threads on same numa node. */
 unsigned core_id;   /* CPU core id of this pmd thread. */
 int numa_id;/* numa node id of this pmd thread. */
 atomic_int tx_qid;  /* Queue id used by this pmd thread to
@@ -485,8 +483,8 @@ static void dp_netdev_recirculate(struct 
dp_netdev_pmd_thread *,
 static void dp_netdev_disable_upcall(struct dp_netdev *);
 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
-struct dp_netdev *dp, int index,
-unsigned core_id, int numa_id);
+struct dp_netdev *dp, unsigned core_id,
+int numa_id);
 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_set_nonpmd(struct dp_netdev *dp);
 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
@@ -2787,8 +2785,7 @@ dp_netdev_set_nonpmd(struct dp_netdev *dp)
 struct dp_netdev_pmd_thread *non_pmd;
 
 non_pmd = xzalloc(sizeof *non_pmd);
-dp_netdev_configure_pmd(non_pmd, dp, 0, NON_PMD_CORE_ID,
-OVS_NUMA_UNSPEC);
+dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
 }
 
 /* Caller must have valid pointer to 'pmd'. */
@@ -2829,10 +2826,9 @@ dp_netdev_pmd_get_next(struct dp_netdev *dp, struct 
cmap_position *pos)
 /* Configures the 'pmd' based on the input argument. */
 static void
 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
-int index, unsigned core_id, int numa_id)
+unsigned core_id, int numa_id)
 {
 pmd->dp = dp;
-pmd->index = index;
 pmd->core_id = core_id;
 pmd->numa_id = numa_id;
 pmd->poll_cnt = 0;
@@ -3140,7 +3136,7 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int 
numa_id)
 for (i = 0; i < can_have; i++) {
 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
 pmds[i] = xzalloc(sizeof **pmds);
-dp_netdev_configure_pmd(pmds[i], dp, i, core_id, numa_id);
+dp_netdev_configure_pmd(pmds[i], dp, core_id, numa_id);
 }
 
 /* Distributes rx queues of this numa node between new pmd threads. */
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v9 03/15] dpif-netdev: Factor out port_create() from do_add_port().

2016-04-22 Thread Daniele Di Proietto
Instead of performing every operation inside do_add_port() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 69 ++-
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 060f5e0..a224b43 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1095,29 +1095,22 @@ hash_port_no(odp_port_t port_no)
 }
 
 static int
-do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
-odp_port_t port_no)
-OVS_REQUIRES(dp->port_mutex)
+port_create(const char *devname, const char *open_type, const char *type,
+odp_port_t port_no, struct dp_netdev_port **portp)
 {
 struct netdev_saved_flags *sf;
 struct dp_netdev_port *port;
-struct netdev *netdev;
 enum netdev_flags flags;
-const char *open_type;
-int error = 0;
-int i, n_open_rxqs = 0;
+struct netdev *netdev;
+int n_open_rxqs = 0;
+int i, error;
 
-/* Reject devices already in 'dp'. */
-if (!get_port_by_name(dp, devname, )) {
-error = EEXIST;
-goto out;
-}
+*portp = NULL;
 
 /* Open and validate network device. */
-open_type = dpif_netdev_port_open_type(dp->class, type);
 error = netdev_open(devname, open_type, );
 if (error) {
-goto out;
+return error;
 }
 /* XXX reject non-Ethernet devices */
 
@@ -1125,7 +1118,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 if (flags & NETDEV_LOOPBACK) {
 VLOG_ERR("%s: cannot add a loopback device", devname);
 error = EINVAL;
-goto out_close;
+goto out;
 }
 
 if (netdev_is_pmd(netdev)) {
@@ -1134,7 +1127,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 if (n_cores == OVS_CORE_UNSPEC) {
 VLOG_ERR("%s, cannot get cpu core info", devname);
 error = ENOENT;
-goto out_close;
+goto out;
 }
 /* There can only be ovs_numa_get_n_cores() pmd threads,
  * so creates a txq for each, and one extra for the non
@@ -1143,14 +1136,14 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
   netdev_requested_n_rxq(netdev));
 if (error && (error != EOPNOTSUPP)) {
 VLOG_ERR("%s, cannot set multiq", devname);
-goto out_close;
+goto out;
 }
 }
 port = xzalloc(sizeof *port);
 port->port_no = port_no;
 port->netdev = netdev;
 port->n_rxq = netdev_n_rxq(netdev);
-port->rxq = xmalloc(sizeof *port->rxq * port->n_rxq);
+port->rxq = xcalloc(port->n_rxq, sizeof *port->rxq);
 port->type = xstrdup(type);
 port->latest_requested_n_rxq = netdev_requested_n_rxq(netdev);
 
@@ -1170,12 +1163,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 }
 port->sf = sf;
 
-cmap_insert(>ports, >node, hash_port_no(port_no));
-
-if (netdev_is_pmd(netdev)) {
-dp_netdev_add_port_to_pmds(dp, port);
-}
-seq_change(dp->port_seq);
+*portp = port;
 
 return 0;
 
@@ -1186,13 +1174,42 @@ out_rxq_close:
 free(port->type);
 free(port->rxq);
 free(port);
-out_close:
-netdev_close(netdev);
+
 out:
+netdev_close(netdev);
 return error;
 }
 
 static int
+do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
+odp_port_t port_no)
+OVS_REQUIRES(dp->port_mutex)
+{
+struct dp_netdev_port *port;
+int error;
+
+/* Reject devices already in 'dp'. */
+if (!get_port_by_name(dp, devname, )) {
+return EEXIST;
+}
+
+error = port_create(devname, dpif_netdev_port_open_type(dp->class, type),
+type, port_no, );
+if (error) {
+return error;
+}
+
+cmap_insert(>ports, >node, hash_port_no(port_no));
+
+if (netdev_is_pmd(port->netdev)) {
+dp_netdev_add_port_to_pmds(dp, port);
+}
+seq_change(dp->port_seq);
+
+return 0;
+}
+
+static int
 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
  odp_port_t *port_nop)
 {
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v9 01/15] dpif-netdev: Destroy 'port_mutex' in dp_netdev_free().

2016-04-22 Thread Daniele Di Proietto
Found by inspection.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 1e8a37c..24717cc 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -986,6 +986,7 @@ dp_netdev_free(struct dp_netdev *dp)
 
 seq_destroy(dp->port_seq);
 cmap_destroy(>ports);
+ovs_mutex_destroy(>port_mutex);
 
 /* Upcalls must be disabled at this point */
 dp_netdev_destroy_upcall_lock(dp);
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v9 05/15] dpif-netdev: Fix race condition in pmd thread initialization.

2016-04-22 Thread Daniele Di Proietto
The pmds and the main threads are synchronized using a condition
variable.  The main thread writes a new configuration, then it waits on
the condition variable.  A pmd thread reads the new configuration, then
it calls signal() on the condition variable. To make sure that the pmds
and the main thread have a consistent view, each signal() should be
backed by a wait().

Currently the first signal() doesn't have a corresponding wait().  If
the pmd thread takes a long time to start and the signal() is received
by a later wait, the threads will have an inconsistent view.

The commit fixes the problem by removing the first signal() from the
pmd thread.

This is hardly a problem on current master, because the main thread
will call the first wait() a long time after the creation of a pmd
thread.  It becomes a problem with the next commits.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c3be4eb..fbd23cf 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2651,21 +2651,22 @@ dpif_netdev_wait(struct dpif *dpif)
 
 static int
 pmd_load_queues(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **ppoll_list)
-OVS_REQUIRES(pmd->poll_mutex)
 {
 struct rxq_poll *poll_list = *ppoll_list;
 struct rxq_poll *poll;
 int i;
 
+ovs_mutex_lock(>poll_mutex);
 poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);
 
 i = 0;
 LIST_FOR_EACH (poll, node, >poll_list) {
 poll_list[i++] = *poll;
 }
+ovs_mutex_unlock(>poll_mutex);
 
 *ppoll_list = poll_list;
-return pmd->poll_cnt;
+return i;
 }
 
 static void *
@@ -2675,6 +2676,7 @@ pmd_thread_main(void *f_)
 unsigned int lc = 0;
 struct rxq_poll *poll_list;
 unsigned int port_seq = PMD_INITIAL_SEQ;
+bool exiting;
 int poll_cnt;
 int i;
 
@@ -2684,13 +2686,10 @@ pmd_thread_main(void *f_)
 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
 pmd_thread_setaffinity_cpu(pmd->core_id);
+poll_cnt = pmd_load_queues(pmd, _list);
 reload:
 emc_cache_init(>flow_cache);
 
-ovs_mutex_lock(>poll_mutex);
-poll_cnt = pmd_load_queues(pmd, _list);
-ovs_mutex_unlock(>poll_mutex);
-
 /* List port/core affinity */
 for (i = 0; i < poll_cnt; i++) {
VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
@@ -2698,10 +2697,6 @@ reload:
 netdev_rxq_get_queue_id(poll_list[i].rx));
 }
 
-/* Signal here to make sure the pmd finishes
- * reloading the updated configuration. */
-dp_netdev_pmd_reload_done(pmd);
-
 for (;;) {
 for (i = 0; i < poll_cnt; i++) {
 dp_netdev_process_rxq_port(pmd, poll_list[i].port, 
poll_list[i].rx);
@@ -2724,14 +2719,18 @@ reload:
 }
 }
 
+poll_cnt = pmd_load_queues(pmd, _list);
+exiting = latch_is_set(>exit_latch);
+/* Signal here to make sure the pmd finishes
+ * reloading the updated configuration. */
+dp_netdev_pmd_reload_done(pmd);
+
 emc_cache_uninit(>flow_cache);
 
-if (!latch_is_set(>exit_latch)){
+if (!exiting) {
 goto reload;
 }
 
-dp_netdev_pmd_reload_done(pmd);
-
 free(poll_list);
 return NULL;
 }
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v9 00/15] Reconfigure netdev at runtime

2016-04-22 Thread Daniele Di Proietto
Currently we treat set_multiq() calls specially in netdev and dpif-netdev:
every pmd thread must be stopped and set_multiq() is allowed to destroy and
recreate the device.

I think we can improve this by:
* Generalizing the mechanism to allow changing other parameters at runtime
  (such as MTU).
* Involving the layer above (dpif-netdev) less.  The request for changes
  often comes from below (netdev_dpdk_set_config(), or the vhost new_device()
  callback).  There's no need for dpif-netdev to remember the requested value,
  all that it needs to know is that a configuration change is requested.

This series implements exactly this: a mechanism to allow a netdev provider
to request configuration changes, to which dpif-netdev will respond by
stopping rx/tx and calling a netdev function to apply the new configuration.

The new mechanism is used in this series to replace the set_multiq() call,
but the idea is to use it also at least for:

* Changing the MTU at runtime
* Automatically detecting the number of rx queues for a vhost-user device
* Move a DPDK vhost device to the proper NUMA socket

The first commits refactor some code in dpif-netdev and, most importantly,
avoid using RCU for ports.  Each thread will have its local copy of all the
ports in the datapath.

The series is also available here:

https://github.com/ddiproietto/ovs/tree/configchangesv9

v9:
* Fix HMAP_FOR_EACH_POP: now it's O(n) in the number of buckets
* Avoid using & in clang thread safety annotations
* Fix for non pmd devices: dp_netdev_set_pmds_on_numa() and
  dp_netdev_set_nonpmd() now add the ports to the pmd local cache.
* Merged patch "dpif-netdev: Remove duplicate code in
  dp_netdev_set_pmds_on_numa()." with "dpif-netdev: Add pmd thread
  local port cache for transmission."

v8:
* Update comment in rcu.h: ovs_mutex_cond_wait doesn't quiesce.
* Change 'set_multiq' to 'set_tx_multiq'.
* Added documentation in comments and commit messages explaining thread local
  port cache.
* Fixed style issues reported by checkpatch.py.
* Fixed race condition when deleting pmd thread.

v7:
* Dropped already applied patches.
* Stop using RCU for ports.
* Rebased against master.

v6:
* Rebased against master.
* Check return value of netdev_rxq_open().
* Fix comment.

v5:
* Style fixes.
* Fixed a bug in dp_netdev_free() in patch 6.

v4:
* Added another patch to uniform names of variables in netdev-dpdk (no
  functional change)
* Update some netdev comments to document the relation between
  netdev_set_multiq() and netdev_reconfigure()
* Clarify that when netdev_reconfigure() is called no call to netdev_send()
  or netdev_rxq_recv() must be issued.
* Move check to skip reconfiguration in netdev_dpdk_reconfigure() before
  rte_eth_dev_stop().

v3:
* Fixed another outdated comment about rx queue configuration, as pointed out
  by Mark
* Removed unnecessary and buggy initialization of requested_n_rxq in
  reconfigure_pmd_threads().
* Removed unused 'err' variable in netdev_dpdk_set_multiq().
* Changed comparison in netdev_set_multiq() to use previous
  'netdev->requested_n_txq' instead of 'netdev->up.n_txq'
* Return immediately in netdev_dpdk_reconfigure() if configuration didn't
  change anything.

v2:
* Fixed do_add_port(): we have to call netdev_reconfigure() before opening
  the rxqs.  This prevents memory leaks, and makes sure that the datapath
  polls the appropriate number of queues
* Fixed netdev_dpdk_vhost_set_multiq(): it must call
  netdev_request_reconfigure(). Since it is now equal to
  netdev_dpdk_set_multiq(), the two function have been merged.
* Fixed netdev_dpdk_set_config(): dev->requested_n_rxq is now accessed
  while holding the appropriate mutex.
* Fixed some outdated comments about rx queue configuration.


Daniele Di Proietto (15):
  dpif-netdev: Destroy 'port_mutex' in dp_netdev_free().
  dpif-netdev: Remove unused 'index' in dp_netdev_pmd_thread.
  dpif-netdev: Factor out port_create() from do_add_port().
  dpif-netdev: Add functions to modify rxq without reloading pmd
threads.
  dpif-netdev: Fix race condition in pmd thread initialization.
  hmap: Add HMAP_FOR_EACH_POP.
  dpif-netdev: Add pmd thread local port cache for transmission.
  hmap: Use struct for hmap_at_position().
  dpif-netdev: Use hmap for ports.
  ovs-thread: Do not quiesce in ovs_mutex_cond_wait().
  ofproto-dpif: Call dpif_poll_threads_set() before dpif_run().
  dpif-netdev: Change pmd thread configuration in dpif_netdev_run().
  dpif-netdev: Handle errors in reconfigure_pmd_threads().
  netdev: Add reconfigure request mechanism.
  netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

 lib/cfm.c|   5 +-
 lib/dpif-netdev.c| 756 ---
 lib/dpif-provider.h  |   3 +-
 lib/hmap.c   |  26 +-
 lib/hmap.h   |  27 +-
 lib/id-pool.c|   5 +-
 lib/learning-switch.c|   5 +-
 lib/netdev-bsd.c

Re: [ovs-dev] [PATCH] FAQ: Add entry for OVS/DPDK version dependencies.

2016-04-21 Thread Daniele Di Proietto
Thanks for writing this up, applied to master!

2016-04-19 3:35 GMT-07:00 Kevin Traynor :

> For a given release this is listed in the INSTALL.DPDK.md
> but it gets asked quite a bit on the mailing list, so create
> a table in the FAQ.
>
> Signed-off-by: Kevin Traynor 
> ---
>  FAQ.md |   13 +
>  1 files changed, 13 insertions(+), 0 deletions(-)
>
> diff --git a/FAQ.md b/FAQ.md
> index 0fee992..5777471 100644
> --- a/FAQ.md
> +++ b/FAQ.md
> @@ -233,6 +233,19 @@ Validate flow actions |  YES   |   YES
>   |N/A|   NO|
>  Multiple datapaths|  YES   |   YES  |YES|
>  NO|
>  Tunnel TSO - STT  |  N/A   |   YES  |NO |
>  YES   |
>
> +### Q: What DPDK version does each Open vSwitch release work with?
> +
> +A: The following table lists the DPDK version against which the
> +   given versions of Open vSwitch will successfully build.
> +
> +| Open vSwitch | DPDK
> +|::|:-:
> +|2.2.x | 1.6
> +|2.3.x | 1.6
> +|2.4.x | 2.0
> +|2.5.x | 2.2
> +|2.6.x | 16.04
> +
>  ### Q: I get an error like this when I configure Open vSwitch:
>
> configure: error: Linux kernel in  is version , but
> --
> 1.7.4.1
>
> ___
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
>
___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH 2/2] system-traffic: Add basic geneve tunnel sanity test.

2016-04-21 Thread Daniele Di Proietto
Thanks for adding these tests!

Acked-by: Daniele Di Proietto <diproiet...@vmware.com>


On 20/04/2016 16:07, "Joe Stringer" <j...@ovn.org> wrote:

>Signed-off-by: Joe Stringer <j...@ovn.org>
>---
> tests/system-common-macros.at |  4 
> tests/system-traffic.at   | 41
>+
> 2 files changed, 45 insertions(+)
>
>diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at
>index 1b9b5c1e9f15..2116f1e31357 100644
>--- a/tests/system-common-macros.at
>+++ b/tests/system-common-macros.at
>@@ -159,3 +159,7 @@ m4_define([OVS_CHECK_VXLAN],
> # OVS_CHECK_GRE()
> m4_define([OVS_CHECK_GRE],
> [AT_SKIP_IF([! ip link add foo type gretap help 2>&1 | grep gre
>>/dev/null])])
>+
>+# OVS_CHECK_GENEVE()
>+m4_define([OVS_CHECK_GENEVE],
>+[AT_SKIP_IF([! ip link add foo type geneve help 2>&1 | grep geneve
>>/dev/null])])
>diff --git a/tests/system-traffic.at b/tests/system-traffic.at
>index 8684a5f06c68..a3d93e92c887 100644
>--- a/tests/system-traffic.at
>+++ b/tests/system-traffic.at
>@@ -188,6 +188,47 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3
>-w 2 10.1.1.100 | FORMAT_PI
> OVS_TRAFFIC_VSWITCHD_STOP
> AT_CLEANUP
> 
>+AT_SETUP([datapath - ping over geneve tunnel])
>+OVS_CHECK_GENEVE()
>+
>+OVS_TRAFFIC_VSWITCHD_START()
>+ADD_BR([br-underlay])
>+
>+AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
>+AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"])
>+
>+ADD_NAMESPACES(at_ns0)
>+
>+dnl Set up underlay link from host into the namespace using veth pair.
>+ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24")
>+AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"])
>+AT_CHECK([ip link set dev br-underlay up])
>+
>+dnl Set up tunnel endpoints on OVS outside the namespace and with a
>native
>+dnl linux device inside the namespace.
>+ADD_OVS_TUNNEL([geneve], [br0], [at_gnv0], [172.31.1.1], [10.1.1.100/24])
>+ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100],
>[10.1.1.1/24],
>+  [vni 0])
>+
>+dnl First, check the underlay
>+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 |
>FORMAT_PING], [0], [dnl
>+3 packets transmitted, 3 received, 0% packet loss, time 0ms
>+])
>+
>+dnl Okay, now check the overlay with different packet sizes
>+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 |
>FORMAT_PING], [0], [dnl
>+3 packets transmitted, 3 received, 0% packet loss, time 0ms
>+])
>+NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 |
>FORMAT_PING], [0], [dnl
>+3 packets transmitted, 3 received, 0% packet loss, time 0ms
>+])
>+NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 |
>FORMAT_PING], [0], [dnl
>+3 packets transmitted, 3 received, 0% packet loss, time 0ms
>+])
>+
>+OVS_TRAFFIC_VSWITCHD_STOP
>+AT_CLEANUP
>+
> AT_SETUP([conntrack - controller])
> CHECK_CONNTRACK()
> OVS_TRAFFIC_VSWITCHD_START()
>-- 
>2.1.4
>

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH 1/2] system-traffic: Add basic gre tunnel sanity test.

2016-04-21 Thread Daniele Di Proietto
Acked-by: Daniele Di Proietto <diproiet...@vmware.com>


On 21/04/2016 13:29, "Joe Stringer" <j...@ovn.org> wrote:

>On 20 April 2016 at 16:07, Joe Stringer <j...@ovn.org> wrote:
>> Signed-off-by: Joe Stringer <j...@ovn.org>
>> ---
>
>
>> +dnl Set up tunnel endpoints on OVS outside the namespace and with a
>>native
>> +dnl linux device inside the namespace.
>> +ADD_OVS_TUNNEL([gre], [br0], [at_gre0], [172.31.1.1], [10.1.1.100/24])
>> +ADD_NATIVE_TUNNEL([gretap], [ns_gre0], [at_ns0], [172.31.1.100],
>>[10.1.1.1/24],
>> +  [local 172.31.1.1])
>
>The "local" option is optional, I'll probably drop it since it's not
>necessary.

I tested with and without it and it appears to work

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH] system-traffic: Fix IPv6 frag vxlan check.

2016-04-21 Thread Daniele Di Proietto
Thanks for fixing this

Acked-by: Daniele Di Proietto <diproiet...@vmware.com>

On 21/04/2016 14:10, "Joe Stringer" <j...@ovn.org> wrote:

>This was missed before somehow, which would cause the test to fail
>(rather than being skipped) if iproute2 didn't support setting the
>vxlan dstport on the kernel tunnel device.
>
>Signed-off-by: Joe Stringer <j...@ovn.org>
>---
> tests/system-traffic.at | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
>diff --git a/tests/system-traffic.at b/tests/system-traffic.at
>index dceae150d148..10c571769462 100644
>--- a/tests/system-traffic.at
>+++ b/tests/system-traffic.at
>@@ -1495,7 +1495,7 @@ OVS_TRAFFIC_VSWITCHD_STOP
> AT_CLEANUP
> 
> AT_SETUP([conntrack - IPv6 Fragmentation over vxlan])
>-AT_SKIP_IF([! ip link help 2>&1 | grep vxlan >/dev/null])
>+OVS_CHECK_VXLAN()
> CHECK_CONNTRACK()
> 
> OVS_TRAFFIC_VSWITCHD_START()
>-- 
>2.1.4
>

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v8 07/16] hmap: Add HMAP_FOR_EACH_POP.

2016-04-21 Thread Daniele Di Proietto


On 21/04/2016 14:54, "Ben Pfaff" <b...@ovn.org> wrote:

>On Thu, Apr 21, 2016 at 09:41:03PM +, Daniele Di Proietto wrote:
>> 
>> 
>> On 21/04/2016 11:28, "Ben Pfaff" <b...@ovn.org> wrote:
>> 
>> >On Tue, Apr 19, 2016 at 03:28:39PM -0700, Daniele Di Proietto wrote:
>> >> Makes popping each member of the hmap a bit easier.
>> >> 
>> >> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>> >
>> >It's unfortunately quite expensive, though: O(n**2) in the number of
>> >buckets in the hmap, as opposed to O(n) for HMAP_FOR_EACH_SAFE.
>> 
>> You're right, I didn't realize that hmap_first() is O(n) in the number
>>of
>> buckets. Apologies for this oversight and thanks for noticing it.
>> 
>> How about this instead?
>> 
>> ---8<---
>> static inline struct hmap_node *
>> hmap_pop_helper__(struct hmap *hmap, size_t *bucket) {
>> 
>> for (; *bucket <= hmap->mask; (*bucket)++) {
>> struct hmap_node *node = hmap->buckets[*bucket];
>> 
>> if (node) {
>> hmap_remove(hmap, node);
>> return node;
>> }
>> }
>> 
>> return NULL;
>> }
>> 
>> #define HMAP_FOR_EACH_POP(NODE, MEMBER, HMAP) \
>> for (size_t bucket__ = 0;   \
>>  (INIT_CONTAINER(NODE, hmap_pop_helper__(HMAP, __),
>> MEMBER), \
>>   false) \
>>  || (NODE != OBJECT_CONTAINING(NULL, NODE, MEMBER)) || (NODE =
>> NULL);)
>> ---8<---
>> 
>> I wanted to introduce this because I found that sometimes having a
>> "next" local variable is too verbose, but if you don't think it's
>> worth I can drop this patch.
>
>Much better, thanks.
>
>You can write "(a, false) || b || c" as "a, b || c" though.

Right, I'll fold this in, thanks.

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v8 07/16] hmap: Add HMAP_FOR_EACH_POP.

2016-04-21 Thread Daniele Di Proietto


On 21/04/2016 11:28, "Ben Pfaff" <b...@ovn.org> wrote:

>On Tue, Apr 19, 2016 at 03:28:39PM -0700, Daniele Di Proietto wrote:
>> Makes popping each member of the hmap a bit easier.
>> 
>> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>
>It's unfortunately quite expensive, though: O(n**2) in the number of
>buckets in the hmap, as opposed to O(n) for HMAP_FOR_EACH_SAFE.

You're right, I didn't realize that hmap_first() is O(n) in the number of
buckets. Apologies for this oversight and thanks for noticing it.

How about this instead?

---8<---
static inline struct hmap_node *
hmap_pop_helper__(struct hmap *hmap, size_t *bucket) {

for (; *bucket <= hmap->mask; (*bucket)++) {
struct hmap_node *node = hmap->buckets[*bucket];

if (node) {
hmap_remove(hmap, node);
return node;
}
}

return NULL;
}

#define HMAP_FOR_EACH_POP(NODE, MEMBER, HMAP) \
for (size_t bucket__ = 0;   \
 (INIT_CONTAINER(NODE, hmap_pop_helper__(HMAP, &bucket__),
MEMBER), \
  false) \
 || (NODE != OBJECT_CONTAINING(NULL, NODE, MEMBER)) || (NODE =
NULL);)
---8<---

I wanted to introduce this because I found that sometimes having a
"next" local variable is too verbose, but if you don't think it's
worth I can drop this patch.

Thanks,

Daniele

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v8 10/16] dpif-netdev: Use hmap for ports.

2016-04-20 Thread Daniele Di Proietto


On 20/04/2016 07:21, "Ilya Maximets" <i.maxim...@samsung.com> wrote:

>On 20.04.2016 01:28, diproiettod at vmware.com (Daniele Di Proietto)
>wrote:
>> netdev objects are hard to use with RCU, because it's not possible to
>> split removal and reclamation.  Postponing the removal means that the
>> port is not removed and cannot be readded immediately.  Waiting for
>> reclamation means introducing a quiescent state, and that may introduce
>> subtle bugs, due to the RCU model we use in userspace.
>> 
>> This commit changes the port container from cmap to hmap.  'port_mutex'
>> must be held by readers and writers.  This shouldn't have performance
>> impact, as readers in the fast path use a thread local cache.
>> 
>> Signed-off-by: Daniele Di Proietto 
>> ---
>>  lib/dpif-netdev.c | 96
>>+--
>>  1 file changed, 57 insertions(+), 39 deletions(-)
>> 
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index bd2249e..8cc37e2 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -195,9 +195,10 @@ struct dp_netdev {
>>  
>>  /* Ports.
>>   *
>> - * Protected by RCU.  Take the mutex to add or remove ports. */
>> + * Any lookup into 'ports' or any access to the dp_netdev_ports
>>found
>> + * through 'ports' requires taking 'port_mutex'. */
>>  struct ovs_mutex port_mutex;
>> -struct cmap ports;
>> +struct hmap ports;
>>  struct seq *port_seq;   /* Incremented whenever a port
>>changes. */
>>  
>>  /* Protects access to ofproto-dpif-upcall interface during
>>revalidator
>> @@ -228,7 +229,8 @@ struct dp_netdev {
>>  };
>>  
>>  static struct dp_netdev_port *dp_netdev_lookup_port(const struct
>>dp_netdev *dp,
>> -odp_port_t);
>> +odp_port_t)
>> +OVS_REQUIRES(>port_mutex);
>
>OVS_REQUIRES(dp->port_mutex);
>here and 2 times more below.

I've changed them, thanks.  I think the analyzer accepts both (a pointer
or the
object itself), but I prefer the syntax you suggested.

>
>>  
>>  enum dp_stat_type {
>>  DP_STAT_EXACT_HIT,  /* Packets that had an exact match
>>(emc). */
>> @@ -248,7 +250,7 @@ enum pmd_cycles_counter_type {
>>  struct dp_netdev_port {
>>  odp_port_t port_no;
>>  struct netdev *netdev;
>> -struct cmap_node node;  /* Node in dp_netdev's 'ports'. */
>> +struct hmap_node node;  /* Node in dp_netdev's 'ports'. */
>>  struct netdev_saved_flags *sf;
>>  unsigned n_rxq; /* Number of elements in 'rxq' */
>>  struct netdev_rxq **rxq;
>> @@ -476,9 +478,11 @@ struct dpif_netdev {
>>  };
>>  
>>  static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
>> -  struct dp_netdev_port **portp);
>> +  struct dp_netdev_port **portp)
>> +OVS_REQUIRES(dp->port_mutex);
>>  static int get_port_by_name(struct dp_netdev *dp, const char *devname,
>> -struct dp_netdev_port **portp);
>> +struct dp_netdev_port **portp)
>> +OVS_REQUIRES(dp->port_mutex);
>>  static void dp_netdev_free(struct dp_netdev *)
>>  OVS_REQUIRES(dp_netdev_mutex);
>>  static int do_add_port(struct dp_netdev *dp, const char *devname,
>> @@ -522,7 +526,8 @@ dp_netdev_add_rxq_to_pmd(struct
>>dp_netdev_pmd_thread *pmd,
>>   struct dp_netdev_port *port, struct
>>netdev_rxq *rx);
>>  static struct dp_netdev_pmd_thread *
>>  dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
>> -static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp);
>> +static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
>> +OVS_REQUIRES(dp->port_mutex);
>>  static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
>>  static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
>>  static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
>> @@ -913,7 +918,7 @@ create_dp_netdev(const char *name, const struct
>>dpif_class *class,
>>  atomic_flag_clear(>destroyed);
>>  
>>  ovs_mutex_init(>port_mutex);
>> -cmap_init(>ports);
>> +hmap_init(>ports);
>>  dp->port_seq = seq_create();
>>  fat_rwlock_init(>upcall_rwlock);
>>  
>> @@ -984,7 +989,7 @@ static voi

Re: [ovs-dev] [PATCH v7 05/16] dpif-netdev: Fix race condition in pmd thread initialization.

2016-04-19 Thread Daniele Di Proietto


On 19/04/2016 02:48, "Ilya Maximets" <i.maxim...@samsung.com> wrote:

>On 19.04.2016 10:18, Ilya Maximets wrote:
>> There was a reason for 2 calls for dp_netdev_pmd_reload_done() inside
>> pmd_thread_main(). The reason is that we must wait until PMD thread
>> completely done with reloading. This patch introduces race condition
>> for pmd->exit_latch. While removing last port on numa node
>> dp_netdev_reload_pmd__(pmd) will be called twice for each port.
>> First call to remove port and second to destroy PMD thread.
>> pmd->exit_latch is set between these two calls. This leads to probable
>> situation when PMD thread will exit while processing first reloading.
>> Main thread will wait forever on cond_wait in second reload in this
>> case. Situation is easily reproducible by addition/deletion of last
>> port (may be after few iterations in a cycle).
>> 
>> Best regards, Ilya Maximets.
>
>This incremental should help:
>--
>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>index 588d56f..2235297 100644
>--- a/lib/dpif-netdev.c
>+++ b/lib/dpif-netdev.c
>@@ -2785,6 +2785,7 @@ pmd_thread_main(void *f_)
> unsigned int port_seq = PMD_INITIAL_SEQ;
> int poll_cnt;
> int i;
>+bool exiting;
> 
> poll_cnt = 0;
> poll_list = NULL;
>@@ -2825,14 +2826,15 @@ reload:
> }
> }
> 
>+emc_cache_uninit(>flow_cache);
>+
> poll_cnt = pmd_load_queues_and_ports(pmd, _list);
>+exiting = latch_is_set(>exit_latch);
> /* Signal here to make sure the pmd finishes
>  * reloading the updated configuration. */
> dp_netdev_pmd_reload_done(pmd);
> 
>-emc_cache_uninit(>flow_cache);
>-
>-if (!latch_is_set(>exit_latch)){
>+if (!exiting) {
> goto reload;
> }
> 
>--

You're right, thanks for the detailed analysis and the suggested fix.

I applied the suggested incremental, but kept emc_cache_uninit()
where it is right now.

I sent an updated version here:

http://openvswitch.org/pipermail/dev/2016-April/069835.html

Thanks,

Daniele

>
> 
>> On 08.04.2016 06:13, Daniele Di Proietto wrote:
>>> The pmds and the main threads are synchronized using a condition
>>> variable.  The main thread writes a new configuration, then it waits on
>>> the condition variable.  A pmd thread reads the new configuration, then
>>> it calls signal() on the condition variable. To make sure that the pmds
>>> and the main thread have a consistent view, each signal() should be
>>> backed by a wait().
>>>
>>> Currently the first signal() doesn't have a corresponding wait().  If
>>> the pmd thread takes a long time to start and the signal() is received
>>> by a later wait, the threads will have an inconsistent view.
>>>
>>> The commit fixes the problem by removing the first signal() from the
>>> pmd thread.
>>>
>>> This is hardly a problem on current master, because the main thread
>>> will call the first wait() a long time after the creation of a pmd
>>> thread.  It becomes a problem with the next commits.
>>>
>>> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>>> ---
>>>  lib/dpif-netdev.c | 21 +
>>>  1 file changed, 9 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>> index 9c32c64..2424d3e 100644
>>> --- a/lib/dpif-netdev.c
>>> +++ b/lib/dpif-netdev.c
>>> @@ -2652,21 +2652,22 @@ dpif_netdev_wait(struct dpif *dpif)
>>>  
>>>  static int
>>>  pmd_load_queues(struct dp_netdev_pmd_thread *pmd, struct rxq_poll
>>>**ppoll_list)
>>> -OVS_REQUIRES(pmd->poll_mutex)
>>>  {
>>>  struct rxq_poll *poll_list = *ppoll_list;
>>>  struct rxq_poll *poll;
>>>  int i;
>>>  
>>> +ovs_mutex_lock(>poll_mutex);
>>>  poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof
>>>*poll_list);
>>>  
>>>  i = 0;
>>>  LIST_FOR_EACH (poll, node, >poll_list) {
>>>  poll_list[i++] = *poll;
>>>  }
>>> +ovs_mutex_unlock(>poll_mutex);
>>>  
>>>  *ppoll_list = poll_list;
>>> -return pmd->poll_cnt;
>>> +return i;
>>>  }
>>>  
>>>  static void *
>>> @@ -2685,13 +2686,10 @@ pmd_thread_main(void *f_)
>>>  /* Stores the p

[ovs-dev] [PATCH v8 16/16] netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

2016-04-19 Thread Daniele Di Proietto
This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduced reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c |  69 +-
 lib/netdev-bsd.c  |   2 +-
 lib/netdev-dpdk.c | 195 ++
 lib/netdev-dummy.c|   2 +-
 lib/netdev-linux.c|   2 +-
 lib/netdev-provider.h |  23 +++---
 lib/netdev-vport.c|   2 +-
 lib/netdev.c  |  36 +++---
 lib/netdev.h  |   3 +-
 9 files changed, 160 insertions(+), 174 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 3a250fd..cce3bf1 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -257,8 +257,6 @@ struct dp_netdev_port {
 unsigned n_rxq; /* Number of elements in 'rxq' */
 struct netdev_rxq **rxq;
 char *type; /* Port type as requested by user. */
-int latest_requested_n_rxq; /* Latest requested from netdev number
-   of rx queues. */
 };
 
 /* Contained by struct dp_netdev_flow's 'stats' member.  */
@@ -1161,20 +1159,26 @@ port_create(const char *devname, const char *open_type, 
const char *type,
 /* There can only be ovs_numa_get_n_cores() pmd threads,
  * so creates a txq for each, and one extra for the non
  * pmd threads. */
-error = netdev_set_multiq(netdev, n_cores + 1,
-  netdev_requested_n_rxq(netdev));
+error = netdev_set_tx_multiq(netdev, n_cores + 1);
 if (error && (error != EOPNOTSUPP)) {
 VLOG_ERR("%s, cannot set multiq", devname);
 goto out;
 }
 }
+
+if (netdev_is_reconf_required(netdev)) {
+error = netdev_reconfigure(netdev);
+if (error) {
+goto out;
+}
+}
+
 port = xzalloc(sizeof *port);
 port->port_no = port_no;
 port->netdev = netdev;
 port->n_rxq = netdev_n_rxq(netdev);
 port->rxq = xcalloc(port->n_rxq, sizeof *port->rxq);
 port->type = xstrdup(type);
-port->latest_requested_n_rxq = netdev_requested_n_rxq(netdev);
 
 for (i = 0; i < port->n_rxq; i++) {
 error = netdev_rxq_open(netdev, >rxq[i], i);
@@ -2455,27 +2459,6 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op 
**ops, size_t n_ops)
 }
 }
 
-/* Returns true if the configuration for rx queues is changed. */
-static bool
-pmd_n_rxq_changed(const struct dp_netdev *dp)
-{
-struct dp_netdev_port *port;
-
-ovs_mutex_lock(>port_mutex);
-HMAP_FOR_EACH (port, node, >ports) {
-int requested_n_rxq = netdev_requested_n_rxq(port->netdev);
-
-if (netdev_is_pmd(port->netdev)
-&& port->latest_requested_n_rxq != requested_n_rxq) {
-ovs_mutex_unlock(>port_mutex);
-return true;
-}
-}
-ovs_mutex_unlock(>port_mutex);
-
-return false;
-}
-
 static bool
 cmask_equals(const char *a, const char *b)
 {
@@ -2599,11 +2582,9 @@ static int
 port_reconfigure(struct dp_netdev_port *port)
 {
 struct netdev *netdev = port->netdev;
-int requested_n_rxq = netdev_requested_n_rxq(netdev);
 int i, err;
 
-if (!netdev_is_pmd(port->netdev)
-|| port->latest_requested_n_rxq != requested_n_rxq) {
+if (!netdev_is_reconf_required(netdev)) {
 return 0;
 }
 
@@ -2614,15 +2595,14 @@ port_reconfigure(struct dp_netdev_port *port)
 }
 port->n_rxq = 0;
 
-/* Sets the new rx queue config. */
-err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores() + 1,
-requested_n_rxq);
+/* Allows 'netdev' to apply the pending configuration changes. */
+err = netdev_reconfigure(netdev);
 if (err && (err != EOPNOTSUPP)) {
-VLOG_ERR("Failed to set dpdk interface %s rx_queue to: %u",
- netdev_get_name(port->netdev), requested_n_rxq);
+VLOG_ERR("Failed to set interface %s new configuration",
+ netdev_get_name(netdev));
 return err;
 }
-/* If the set_multiq() above succeeds, reopens the 'rxq's. */
+/* If the netdev_reconfigure( above succeeds, reopens the 'rxq's. */
 port->rxq = xrealloc(port->rxq, sizeof *port->rxq * netdev_n_rxq(netdev));
 for (i = 0; i < netd

[ovs-dev] [PATCH v8 12/16] ofproto-dpif: Call dpif_poll_threads_set() before dpif_run().

2016-04-19 Thread Daniele Di Proietto
An upcoming commit will make dpif_poll_threads_set() record the
requested configuration and dpif_run() apply it, so it makes sense to
change the order.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
Tested-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavan...@intel.com>
---
 ofproto/ofproto-dpif.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 747482c..2aa12b7 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -534,6 +534,8 @@ type_run(const char *type)
 return 0;
 }
 
+/* This must be called before dpif_run() */
+dpif_poll_threads_set(backer->dpif, pmd_cpu_mask);
 
 if (dpif_run(backer->dpif)) {
 backer->need_revalidate = REV_RECONFIGURE;
@@ -562,8 +564,6 @@ type_run(const char *type)
 udpif_set_threads(backer->udpif, n_handlers, n_revalidators);
 }
 
-dpif_poll_threads_set(backer->dpif, pmd_cpu_mask);
-
 if (backer->need_revalidate) {
 struct ofproto_dpif *ofproto;
 struct simap_node *node;
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v8 13/16] dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

2016-04-19 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c   | 144 ++--
 lib/dpif-provider.h |   3 +-
 2 files changed, 84 insertions(+), 63 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 8cc37e2..c905d1d 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -224,7 +224,9 @@ struct dp_netdev {
 ovsthread_key_t per_pmd_key;
 
 /* Cpu mask for pin of pmd threads. */
+char *requested_pmd_cmask;
 char *pmd_cmask;
+
 uint64_t last_tnl_conf_seq;
 };
 
@@ -2453,18 +2455,17 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op 
**ops, size_t n_ops)
 }
 }
 
-/* Returns true if the configuration for rx queues or cpu mask
- * is changed. */
+/* Returns true if the configuration for rx queues is changed. */
 static bool
-pmd_config_changed(const struct dp_netdev *dp, const char *cmask)
+pmd_n_rxq_changed(const struct dp_netdev *dp)
 {
 struct dp_netdev_port *port;
 
 ovs_mutex_lock(>port_mutex);
 HMAP_FOR_EACH (port, node, >ports) {
-struct netdev *netdev = port->netdev;
-int requested_n_rxq = netdev_requested_n_rxq(netdev);
-if (netdev_is_pmd(netdev)
+int requested_n_rxq = netdev_requested_n_rxq(port->netdev);
+
+if (netdev_is_pmd(port->netdev)
 && port->latest_requested_n_rxq != requested_n_rxq) {
 ovs_mutex_unlock(>port_mutex);
 return true;
@@ -2472,69 +2473,29 @@ pmd_config_changed(const struct dp_netdev *dp, const 
char *cmask)
 }
 ovs_mutex_unlock(>port_mutex);
 
-if (dp->pmd_cmask != NULL && cmask != NULL) {
-return strcmp(dp->pmd_cmask, cmask);
-} else {
-return (dp->pmd_cmask != NULL || cmask != NULL);
+return false;
+}
+
+static bool
+cmask_equals(const char *a, const char *b)
+{
+if (a && b) {
+return !strcmp(a, b);
 }
+
+return a == NULL && b == NULL;
 }
 
-/* Resets pmd threads if the configuration for 'rxq's or cpu mask changes. */
+/* Changes the number or the affinity of pmd threads.  The changes are actually
+ * applied in dpif_netdev_run(). */
 static int
 dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
 {
 struct dp_netdev *dp = get_dp_netdev(dpif);
 
-if (pmd_config_changed(dp, cmask)) {
-struct dp_netdev_port *port;
-
-dp_netdev_destroy_all_pmds(dp);
-
-ovs_mutex_lock(>port_mutex);
-HMAP_FOR_EACH (port, node, >ports) {
-struct netdev *netdev = port->netdev;
-int requested_n_rxq = netdev_requested_n_rxq(netdev);
-if (netdev_is_pmd(port->netdev)
-&& port->latest_requested_n_rxq != requested_n_rxq) {
-int i, err;
-
-/* Closes the existing 'rxq's. */
-for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
-netdev_rxq_close(port->rxq[i]);
-port->rxq[i] = NULL;
-}
-port->n_rxq = 0;
-
-/* Sets the new rx queue config.  */
-err = netdev_set_multiq(port->netdev,
-ovs_numa_get_n_cores() + 1,
-requested_n_rxq);
-if (err && (err != EOPNOTSUPP)) {
-VLOG_ERR("Failed to set dpdk interface %s rx_queue to:"
- " %u", netdev_get_name(port->netdev),
- requested_n_rxq);
-ovs_mutex_unlock(>port_mutex);
-return err;
-}
-port->latest_requested_n_rxq = requested_n_rxq;
-/* If the set_multiq() above succeeds, reopens the 'rxq's. */
-port->n_rxq = netdev_n_rxq(port->netdev);
-port->rxq = xrealloc(port->rxq, sizeof *port->rxq * 
port->n_rxq);
-for (i = 0; i < port->n_rxq; i++) {
-netdev_rxq_open(port->netdev, >rxq[i], i);
-}
-}
-}
-/* Reconfigures the cpu mask. */
-ovs_numa_set_cpu_mask(cmask);
-free(dp->pmd_cmask);
-dp->pmd_cmask = cmask ? xstrdup(cmask) : NULL;
-
-/* Restores the non-pmd. */
-dp_netdev_set_nonpmd(dp);
-/* Restores all pmd threads. */
-dp_netdev_reset_pmd_threads(dp);
-ovs_mutex_unlock(>port_mutex);
+if (!cmask_equals(dp->requested_pmd_cmask, cmask)) {
+free(dp->requested_pmd_cmask);
+dp->requested_pmd_cmask = cmask ? xstrdup(cmask) : NULL;
 }
 
 return 0;
@@ -2634,6 +2595,59 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 }
 }
 
+static void
+reconfigure_pmd_threads(struct dp_netdev *dp)
+OVS_REQUIRES(dp->port_mu

[ovs-dev] [PATCH v8 15/16] netdev: Add reconfigure request mechanism.

2016-04-19 Thread Daniele Di Proietto
A netdev provider, especially a PMD provider (like netdev DPDK), might
not be able to change some of its parameters (such as MTU, or number of
queues) without stopping everything and restarting.

This commit introduces a mechanism that allows a netdev provider to
request a restart (netdev_request_reconfigure()).  The upper layer can
be notified via netdev_wait_reconf_required() and
netdev_is_reconf_required().  After closing all the rxqs the upper layer
can finally call netdev_reconfigure(), to make sure that the new
configuration is in place.

This will be used by the next commit to reconfigure rx and tx queues in
netdev-dpdk.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
Tested-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavan...@intel.com>
---
 lib/netdev-bsd.c  |  1 +
 lib/netdev-dpdk.c |  1 +
 lib/netdev-dummy.c|  1 +
 lib/netdev-linux.c|  1 +
 lib/netdev-provider.h | 27 ++-
 lib/netdev-vport.c|  1 +
 lib/netdev.c  | 39 +++
 lib/netdev.h  |  4 
 8 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index 49c05f4..32e8f74 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1536,6 +1536,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_bsd_arp_lookup, /* arp_lookup */  \
  \
 netdev_bsd_update_flags, \
+NULL, /* reconfigure */  \
  \
 netdev_bsd_rxq_alloc,\
 netdev_bsd_rxq_construct,\
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 208c5f5..c4ff039 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2720,6 +2720,7 @@ static const struct dpdk_qos_ops egress_policer_ops = {
 NULL,   /* arp_lookup */  \
   \
 netdev_dpdk_update_flags, \
+NULL,   /* reconfigure */ \
   \
 netdev_dpdk_rxq_alloc,\
 netdev_dpdk_rxq_construct,\
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index 5acb4e1..a001322 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1275,6 +1275,7 @@ static const struct netdev_class dummy_class = {
 NULL,   /* arp_lookup */
 
 netdev_dummy_update_flags,
+NULL,   /* reconfigure */
 
 netdev_dummy_rxq_alloc,
 netdev_dummy_rxq_construct,
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 2c1ffec..1af08f3 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2806,6 +2806,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_linux_arp_lookup,\
 \
 netdev_linux_update_flags,  \
+NULL,   /* reconfigure */   \
 \
 netdev_linux_rxq_alloc, \
 netdev_linux_rxq_construct, \
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index cda25eb..853fc44 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -52,6 +52,16 @@ struct netdev {
  * 'netdev''s flags, features, ethernet address, or carrier changes. */
 uint64_t change_seq;
 
+/* A netdev provider might be unable to change some of the device's
+ * parameter (n_rxq, mtu) when the device is in use.  In this case
+ * the provider can notify the upper layer by calling
+ * netdev_request_reconfigure().  The upper layer will react by stopping
+ * the operations on the device and calling netdev_reconfigure() to allow
+ * the configuration changes.  'last_reconfigure_seq' remembers the value
+ * of 'reconfigure_seq' when the last reconfiguration happened. */
+struct seq *reconfigure_seq;
+uint64_t last_reconfigure_seq;
+
 /* The core netdev code initializes these at netdev construction and only
  * provide read-only access to its client.  Netdev implementations may
  * modify them. */
@@ -64,7 +74,7 @@ struct netdev {
 struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". 
*/
 };
 
-static void
+static inline void
 netdev_change_seq_changed(const struct netdev *netdev_)
 {
 struct netdev *netdev = CONST_CAST(struct netdev *, netdev_);
@@ -75,6 +85,12 @@ netdev_change_seq_ch

[ovs-dev] [PATCH v8 14/16] dpif-netdev: Handle errors in reconfigure_pmd_threads().

2016-04-19 Thread Daniele Di Proietto
Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved into a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 78 ++-
 1 file changed, 48 insertions(+), 30 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c905d1d..3a250fd 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2595,44 +2595,62 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 }
 }
 
+static int
+port_reconfigure(struct dp_netdev_port *port)
+{
+struct netdev *netdev = port->netdev;
+int requested_n_rxq = netdev_requested_n_rxq(netdev);
+int i, err;
+
+if (!netdev_is_pmd(port->netdev)
+|| port->latest_requested_n_rxq != requested_n_rxq) {
+return 0;
+}
+
+/* Closes the existing 'rxq's. */
+for (i = 0; i < port->n_rxq; i++) {
+netdev_rxq_close(port->rxq[i]);
+port->rxq[i] = NULL;
+}
+port->n_rxq = 0;
+
+/* Sets the new rx queue config. */
+err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores() + 1,
+requested_n_rxq);
+if (err && (err != EOPNOTSUPP)) {
+VLOG_ERR("Failed to set dpdk interface %s rx_queue to: %u",
+ netdev_get_name(port->netdev), requested_n_rxq);
+return err;
+}
+/* If the set_multiq() above succeeds, reopens the 'rxq's. */
+port->rxq = xrealloc(port->rxq, sizeof *port->rxq * netdev_n_rxq(netdev));
+for (i = 0; i < netdev_n_rxq(netdev); i++) {
+err = netdev_rxq_open(netdev, >rxq[i], i);
+if (err) {
+return err;
+}
+port->n_rxq++;
+}
+
+return 0;
+}
+
 static void
 reconfigure_pmd_threads(struct dp_netdev *dp)
 OVS_REQUIRES(dp->port_mutex)
 {
-struct dp_netdev_port *port;
+struct dp_netdev_port *port, *next;
 
 dp_netdev_destroy_all_pmds(dp);
 
-HMAP_FOR_EACH (port, node, >ports) {
-struct netdev *netdev = port->netdev;
-int requested_n_rxq = netdev_requested_n_rxq(netdev);
-if (netdev_is_pmd(port->netdev)
-&& port->latest_requested_n_rxq != requested_n_rxq) {
-int i, err;
+HMAP_FOR_EACH_SAFE (port, next, node, >ports) {
+int err;
 
-/* Closes the existing 'rxq's. */
-for (i = 0; i < port->n_rxq; i++) {
-netdev_rxq_close(port->rxq[i]);
-port->rxq[i] = NULL;
-}
-port->n_rxq = 0;
-
-/* Sets the new rx queue config. */
-err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores() + 1,
-requested_n_rxq);
-if (err && (err != EOPNOTSUPP)) {
-VLOG_ERR("Failed to set dpdk interface %s rx_queue to: %u",
- netdev_get_name(port->netdev),
- requested_n_rxq);
-return;
-}
-port->latest_requested_n_rxq = requested_n_rxq;
-/* If the set_multiq() above succeeds, reopens the 'rxq's. */
-port->n_rxq = netdev_n_rxq(port->netdev);
-port->rxq = xrealloc(port->rxq, sizeof *port->rxq * port->n_rxq);
-for (i = 0; i < port->n_rxq; i++) {
-netdev_rxq_open(port->netdev, >rxq[i], i);
-}
+err = port_reconfigure(port);
+if (err) {
+hmap_remove(>ports, >node);
+seq_change(dp->port_seq);
+port_destroy(port);
 }
 }
 /* Reconfigures the cpu mask. */
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v8 11/16] ovs-thread: Do not quiesce in ovs_mutex_cond_wait().

2016-04-19 Thread Daniele Di Proietto
ovs_mutex_cond_wait() is used in many functions in dpif-netdev to
synchronize with pmd threads, but we can't guarantee that the callers do
not hold RCU references, so it's better to avoid quiescing.

In system_stats_thread_func() the code relied on ovs_mutex_cond_wait()
to introduce a quiescent state, so explicit calls to
ovsrcu_quiesce_start() and ovsrcu_quiesce_end() are added there.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/ovs-rcu.h   | 3 +--
 lib/ovs-thread.c| 2 --
 vswitchd/system-stats.c | 6 ++
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/ovs-rcu.h b/lib/ovs-rcu.h
index 600be0b..8750ead 100644
--- a/lib/ovs-rcu.h
+++ b/lib/ovs-rcu.h
@@ -42,8 +42,7 @@
  * A "quiescent state" is a time at which a thread holds no pointers to memory
  * that is managed by RCU; that is, when the thread is known not to reference
  * memory that might be an old version of some object freed via RCU.  For
- * example, poll_block() includes a quiescent state, as does
- * ovs_mutex_cond_wait().
+ * example, poll_block() includes a quiescent state.
  *
  * The following functions manage the recognition of quiescent states:
  *
diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
index 3c065cf..26dd928 100644
--- a/lib/ovs-thread.c
+++ b/lib/ovs-thread.c
@@ -253,9 +253,7 @@ ovs_mutex_cond_wait(pthread_cond_t *cond, const struct 
ovs_mutex *mutex_)
 struct ovs_mutex *mutex = CONST_CAST(struct ovs_mutex *, mutex_);
 int error;
 
-ovsrcu_quiesce_start();
 error = pthread_cond_wait(cond, >lock);
-ovsrcu_quiesce_end();
 
 if (OVS_UNLIKELY(error)) {
 ovs_abort(error, "pthread_cond_wait failed");
diff --git a/vswitchd/system-stats.c b/vswitchd/system-stats.c
index df4971e..129f0cf 100644
--- a/vswitchd/system-stats.c
+++ b/vswitchd/system-stats.c
@@ -37,6 +37,7 @@
 #include "json.h"
 #include "latch.h"
 #include "openvswitch/ofpbuf.h"
+#include "ovs-rcu.h"
 #include "ovs-thread.h"
 #include "poll-loop.h"
 #include "shash.h"
@@ -615,7 +616,12 @@ system_stats_thread_func(void *arg OVS_UNUSED)
 
 ovs_mutex_lock();
 while (!enabled) {
+/* The thread is sleeping, potentially for a long time, and it's
+ * not holding RCU protected references, so it makes sense to
+ * quiesce */
+ovsrcu_quiesce_start();
 ovs_mutex_cond_wait(, );
+ovsrcu_quiesce_end();
 }
 ovs_mutex_unlock();
 
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v8 10/16] dpif-netdev: Use hmap for ports.

2016-04-19 Thread Daniele Di Proietto
netdev objects are hard to use with RCU, because it's not possible to
split removal and reclamation.  Postponing the removal means that the
port is not removed and cannot be readded immediately.  Waiting for
reclamation means introducing a quiescent state, and that may introduce
subtle bugs, due to the RCU model we use in userspace.

This commit changes the port container from cmap to hmap.  'port_mutex'
must be held by readers and writers.  This shouldn't have a performance
impact, as readers in the fast path use a thread local cache.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 96 +--
 1 file changed, 57 insertions(+), 39 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index bd2249e..8cc37e2 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -195,9 +195,10 @@ struct dp_netdev {
 
 /* Ports.
  *
- * Protected by RCU.  Take the mutex to add or remove ports. */
+ * Any lookup into 'ports' or any access to the dp_netdev_ports found
+ * through 'ports' requires taking 'port_mutex'. */
 struct ovs_mutex port_mutex;
-struct cmap ports;
+struct hmap ports;
 struct seq *port_seq;   /* Incremented whenever a port changes. */
 
 /* Protects access to ofproto-dpif-upcall interface during revalidator
@@ -228,7 +229,8 @@ struct dp_netdev {
 };
 
 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
-odp_port_t);
+odp_port_t)
+OVS_REQUIRES(>port_mutex);
 
 enum dp_stat_type {
 DP_STAT_EXACT_HIT,  /* Packets that had an exact match (emc). */
@@ -248,7 +250,7 @@ enum pmd_cycles_counter_type {
 struct dp_netdev_port {
 odp_port_t port_no;
 struct netdev *netdev;
-struct cmap_node node;  /* Node in dp_netdev's 'ports'. */
+struct hmap_node node;  /* Node in dp_netdev's 'ports'. */
 struct netdev_saved_flags *sf;
 unsigned n_rxq; /* Number of elements in 'rxq' */
 struct netdev_rxq **rxq;
@@ -476,9 +478,11 @@ struct dpif_netdev {
 };
 
 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
-  struct dp_netdev_port **portp);
+  struct dp_netdev_port **portp)
+OVS_REQUIRES(dp->port_mutex);
 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
-struct dp_netdev_port **portp);
+struct dp_netdev_port **portp)
+OVS_REQUIRES(dp->port_mutex);
 static void dp_netdev_free(struct dp_netdev *)
 OVS_REQUIRES(dp_netdev_mutex);
 static int do_add_port(struct dp_netdev *dp, const char *devname,
@@ -522,7 +526,8 @@ dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
  struct dp_netdev_port *port, struct netdev_rxq *rx);
 static struct dp_netdev_pmd_thread *
 dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
-static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp);
+static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
+OVS_REQUIRES(dp->port_mutex);
 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
@@ -913,7 +918,7 @@ create_dp_netdev(const char *name, const struct dpif_class 
*class,
 atomic_flag_clear(>destroyed);
 
 ovs_mutex_init(>port_mutex);
-cmap_init(>ports);
+hmap_init(>ports);
 dp->port_seq = seq_create();
 fat_rwlock_init(>upcall_rwlock);
 
@@ -984,7 +989,7 @@ static void
 dp_netdev_free(struct dp_netdev *dp)
 OVS_REQUIRES(dp_netdev_mutex)
 {
-struct dp_netdev_port *port;
+struct dp_netdev_port *port, *next;
 
 shash_find_and_delete(_netdevs, dp->name);
 
@@ -993,15 +998,14 @@ dp_netdev_free(struct dp_netdev *dp)
 ovsthread_key_delete(dp->per_pmd_key);
 
 ovs_mutex_lock(>port_mutex);
-CMAP_FOR_EACH (port, node, >ports) {
-/* PMD threads are destroyed here. do_del_port() cannot quiesce */
+HMAP_FOR_EACH_SAFE (port, next, node, >ports) {
 do_del_port(dp, port);
 }
 ovs_mutex_unlock(>port_mutex);
 cmap_destroy(>poll_threads);
 
 seq_destroy(dp->port_seq);
-cmap_destroy(>ports);
+hmap_destroy(>ports);
 ovs_mutex_destroy(>port_mutex);
 
 /* Upcalls must be disabled at this point */
@@ -1222,7 +1226,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 return error;
 }
 
-cmap_insert(>ports, >node, hash_port_no(port_no));
+hmap_insert(>ports, >node, hash_port_no(port_no));
 
 dp_netdev_add_port_to_pmds(dp, port);
 seq_change(dp->port_seq);
@@ -1288,10 

[ovs-dev] [PATCH v8 07/16] hmap: Add HMAP_FOR_EACH_POP.

2016-04-19 Thread Daniele Di Proietto
Makes popping each member of the hmap a bit easier.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/cfm.c|  5 ++---
 lib/hmap.h   |  4 
 lib/id-pool.c|  5 ++---
 lib/learning-switch.c|  5 ++---
 lib/netdev-linux.c   |  5 ++---
 lib/odp-util.c   |  7 +++
 ofproto/bond.c   | 10 --
 ofproto/in-band.c|  5 ++---
 ofproto/ofproto-dpif-ipfix.c |  5 ++---
 ofproto/ofproto-dpif-xlate.c |  5 ++---
 ofproto/ofproto.c|  5 ++---
 ofproto/pinsched.c   |  5 ++---
 ovn/controller-vtep/vtep.c   |  5 ++---
 ovn/controller/encaps.c  |  5 ++---
 ovn/controller/lport.c   |  5 ++---
 ovn/controller/ofctrl.c  |  5 ++---
 ovn/controller/physical.c|  4 +---
 ovn/controller/pinctrl.c |  5 ++---
 ovn/lib/expr.c   |  5 ++---
 ovn/northd/ovn-northd.c  | 10 --
 ovsdb/monitor.c  |  5 ++---
 ovsdb/row.c  |  5 ++---
 tests/library.at |  2 +-
 tests/test-hmap.c| 42 ++
 24 files changed, 93 insertions(+), 71 deletions(-)

diff --git a/lib/cfm.c b/lib/cfm.c
index cf1f725..fb077de 100644
--- a/lib/cfm.c
+++ b/lib/cfm.c
@@ -374,7 +374,7 @@ cfm_create(const struct netdev *netdev) OVS_EXCLUDED(mutex)
 void
 cfm_unref(struct cfm *cfm) OVS_EXCLUDED(mutex)
 {
-struct remote_mp *rmp, *rmp_next;
+struct remote_mp *rmp;
 
 if (!cfm) {
 return;
@@ -389,8 +389,7 @@ cfm_unref(struct cfm *cfm) OVS_EXCLUDED(mutex)
 hmap_remove(all_cfms, >hmap_node);
 ovs_mutex_unlock();
 
-HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, >remote_mps) {
-hmap_remove(>remote_mps, >node);
+HMAP_FOR_EACH_POP (rmp, node, >remote_mps) {
 free(rmp);
 }
 
diff --git a/lib/hmap.h b/lib/hmap.h
index 53e75cc..08c4719 100644
--- a/lib/hmap.h
+++ b/lib/hmap.h
@@ -192,6 +192,10 @@ bool hmap_contains(const struct hmap *, const struct 
hmap_node *);
  __VA_ARGS__;   \
  (NODE != OBJECT_CONTAINING(NULL, NODE, MEMBER)) || (NODE = NULL); \
  ASSIGN_CONTAINER(NODE, hmap_next(HMAP, &(NODE)->MEMBER), MEMBER))
+#define HMAP_FOR_EACH_POP(NODE, MEMBER, HMAP)  \
+while (!hmap_is_empty(HMAP)\
+   && (INIT_CONTAINER(NODE, hmap_first(HMAP), MEMBER), 1)  \
+   && (hmap_remove(HMAP, &(NODE)->MEMBER), 1))
 
 static inline struct hmap_node *hmap_first(const struct hmap *);
 static inline struct hmap_node *hmap_next(const struct hmap *,
diff --git a/lib/id-pool.c b/lib/id-pool.c
index 6b93d37..f32c008 100644
--- a/lib/id-pool.c
+++ b/lib/id-pool.c
@@ -69,10 +69,9 @@ id_pool_init(struct id_pool *pool, uint32_t base, uint32_t 
n_ids)
 static void
 id_pool_uninit(struct id_pool *pool)
 {
-struct id_node *id_node, *next;
+struct id_node *id_node;
 
-HMAP_FOR_EACH_SAFE(id_node, next, node, >map) {
-hmap_remove(>map, _node->node);
+HMAP_FOR_EACH_POP(id_node, node, >map) {
 free(id_node);
 }
 
diff --git a/lib/learning-switch.c b/lib/learning-switch.c
index c69ca4c..b420fe5 100644
--- a/lib/learning-switch.c
+++ b/lib/learning-switch.c
@@ -269,11 +269,10 @@ void
 lswitch_destroy(struct lswitch *sw)
 {
 if (sw) {
-struct lswitch_port *node, *next;
+struct lswitch_port *node;
 
 rconn_destroy(sw->rconn);
-HMAP_FOR_EACH_SAFE (node, next, hmap_node, >queue_numbers) {
-hmap_remove(>queue_numbers, >hmap_node);
+HMAP_FOR_EACH_POP (node, hmap_node, >queue_numbers) {
 free(node);
 }
 shash_destroy(>queue_names);
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index a7d7ac7..2c1ffec 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -3851,10 +3851,9 @@ static void
 htb_tc_destroy(struct tc *tc)
 {
 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
-struct htb_class *hc, *next;
+struct htb_class *hc;
 
-HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, >tc.queues) {
-hmap_remove(>tc.queues, >tc_queue.hmap_node);
+HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, >tc.queues) {
 free(hc);
 }
 tc_destroy(tc);
diff --git a/lib/odp-util.c b/lib/odp-util.c
index b4689cc..3c75379 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -2081,10 +2081,9 @@ odp_portno_names_get(const struct hmap *portno_names, 
odp_port_t port_no)
 void
 odp_portno_names_destroy(struct hmap *portno_names)
 {
-struct odp_portno_names *odp_portno_names, *odp_portno_names_next;
-HMAP_FOR_EACH_SAFE (odp_portno_names, odp_portno_names_next,
-hmap_node, portno_names) {
-hmap_remove(portno_names, _portno_names->hmap_node);
+struct odp_portno_names *odp_po

[ovs-dev] [PATCH v8 09/16] hmap: Use struct for hmap_at_position().

2016-04-19 Thread Daniele Di Proietto
The interface will be more similar to the cmap.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/hmap.c | 26 --
 lib/hmap.h |  7 ++-
 lib/sset.c | 12 +---
 lib/sset.h |  7 ++-
 ofproto/ofproto-dpif.c |  8 +++-
 5 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/lib/hmap.c b/lib/hmap.c
index b70ce51..9462c5e 100644
--- a/lib/hmap.c
+++ b/lib/hmap.c
@@ -236,24 +236,22 @@ hmap_random_node(const struct hmap *hmap)
 }
 
 /* Returns the next node in 'hmap' in hash order, or NULL if no nodes remain in
- * 'hmap'.  Uses '*bucketp' and '*offsetp' to determine where to begin
- * iteration, and stores new values to pass on the next iteration into them
- * before returning.
+ * 'hmap'.  Uses '*pos' to determine where to begin iteration, and updates
+ * '*pos' to pass on the next iteration into them before returning.
  *
  * It's better to use plain HMAP_FOR_EACH and related functions, since they are
  * faster and better at dealing with hmaps that change during iteration.
  *
- * Before beginning iteration, store 0 into '*bucketp' and '*offsetp'.
- */
+ * Before beginning iteration, set '*pos' to all zeros. */
 struct hmap_node *
 hmap_at_position(const struct hmap *hmap,
- uint32_t *bucketp, uint32_t *offsetp)
+ struct hmap_position *pos)
 {
 size_t offset;
 size_t b_idx;
 
-offset = *offsetp;
-for (b_idx = *bucketp; b_idx <= hmap->mask; b_idx++) {
+offset = pos->offset;
+for (b_idx = pos->bucket; b_idx <= hmap->mask; b_idx++) {
 struct hmap_node *node;
 size_t n_idx;
 
@@ -261,11 +259,11 @@ hmap_at_position(const struct hmap *hmap,
  n_idx++, node = node->next) {
 if (n_idx == offset) {
 if (node->next) {
-*bucketp = node->hash & hmap->mask;
-*offsetp = offset + 1;
+pos->bucket = node->hash & hmap->mask;
+pos->offset = offset + 1;
 } else {
-*bucketp = (node->hash & hmap->mask) + 1;
-*offsetp = 0;
+pos->bucket = (node->hash & hmap->mask) + 1;
+pos->offset = 0;
 }
 return node;
 }
@@ -273,8 +271,8 @@ hmap_at_position(const struct hmap *hmap,
 offset = 0;
 }
 
-*bucketp = 0;
-*offsetp = 0;
+pos->bucket = 0;
+pos->offset = 0;
 return NULL;
 }
 
diff --git a/lib/hmap.h b/lib/hmap.h
index 08c4719..9a96c5f 100644
--- a/lib/hmap.h
+++ b/lib/hmap.h
@@ -201,8 +201,13 @@ static inline struct hmap_node *hmap_first(const struct 
hmap *);
 static inline struct hmap_node *hmap_next(const struct hmap *,
   const struct hmap_node *);
 
+struct hmap_position {
+unsigned int bucket;
+unsigned int offset;
+};
+
 struct hmap_node *hmap_at_position(const struct hmap *,
-   uint32_t *bucket, uint32_t *offset);
+   struct hmap_position *);
 
 /* Returns the number of nodes currently in 'hmap'. */
 static inline size_t
diff --git a/lib/sset.c b/lib/sset.c
index f9d4fc0..4fd3fae 100644
--- a/lib/sset.c
+++ b/lib/sset.c
@@ -251,21 +251,19 @@ sset_equals(const struct sset *a, const struct sset *b)
 }
 
 /* Returns the next node in 'set' in hash order, or NULL if no nodes remain in
- * 'set'.  Uses '*bucketp' and '*offsetp' to determine where to begin
- * iteration, and stores new values to pass on the next iteration into them
- * before returning.
+ * 'set'.  Uses '*pos' to determine where to begin iteration, and updates
+ * '*pos' to pass on the next iteration into them before returning.
  *
  * It's better to use plain SSET_FOR_EACH and related functions, since they are
  * faster and better at dealing with ssets that change during iteration.
  *
- * Before beginning iteration, store 0 into '*bucketp' and '*offsetp'.
- */
+ * Before beginning iteration, set '*pos' to all zeros. */
 struct sset_node *
-sset_at_position(const struct sset *set, uint32_t *bucketp, uint32_t *offsetp)
+sset_at_position(const struct sset *set, struct sset_position *pos)
 {
 struct hmap_node *hmap_node;
 
-hmap_node = hmap_at_position(>map, bucketp, offsetp);
+hmap_node = hmap_at_position(>map, >pos);
 return SSET_NODE_FROM_HMAP_NODE(hmap_node);
 }
 
diff --git a/lib/sset.h b/lib/sset.h
index 7d1d496..9c2f703 100644
--- a/lib/sset.h
+++ b/lib/sset.h
@@ -64,8 +64,13 @@ char *sset_pop(struct sset *);
 struct sset_node *sset_find(const struct sset *, const char *);
 bool sset_contains(const struct sset *, const char *);
 bool sset_equals(const struct sset *, const struct sset *);
+
+struct sset_position {
+struct hmap_position pos;
+};
+
 struct sset_node *sset

[ovs-dev] [PATCH v8 08/16] dpif-netdev: Add pmd thread local port cache for transmission.

2016-04-19 Thread Daniele Di Proietto
A future commit will stop using RCU for 'dp->ports' and use a mutex for
reading/writing them.  To avoid taking a mutex in dp_execute_cb(), which
is called in the fast path, this commit introduces a pmd thread local
cache of ports.

The downside is that every port add/remove now needs to synchronize with
every pmd thread.

Among the advantages, keeping a per thread port mapping could allow
greater control over the txq assignment.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 249 +++---
 1 file changed, 179 insertions(+), 70 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index cedaf39..bd2249e 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -184,6 +184,7 @@ static bool dpcls_lookup(const struct dpcls *cls,
  *
  *dp_netdev_mutex (global)
  *port_mutex
+ *non_pmd_mutex
  */
 struct dp_netdev {
 const struct dpif_class *const class;
@@ -379,6 +380,13 @@ struct rxq_poll {
 struct ovs_list node;
 };
 
+/* Contained by struct dp_netdev_pmd_thread's 'port_cache' or 'tx_ports'. */
+struct tx_port {
+odp_port_t port_no;
+struct netdev *netdev;
+struct hmap_node node;
+};
+
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
  * the performance overhead of interrupt processing.  Therefore netdev can
  * not implement rx-wait for these devices.  dpif-netdev needs to poll
@@ -405,8 +413,8 @@ struct dp_netdev_pmd_thread {
 
 /* Per thread exact-match cache.  Note, the instance for cpu core
  * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
- * need to be protected (e.g. by 'dp_netdev_mutex').  All other
- * instances will only be accessed by its own pmd thread. */
+ * need to be protected by 'non_pmd_mutex'.  Every other instance
+ * will only be accessed by its own pmd thread. */
 struct emc_cache flow_cache;
 
 /* Classifier and Flow-Table.
@@ -435,10 +443,20 @@ struct dp_netdev_pmd_thread {
 atomic_int tx_qid;  /* Queue id used by this pmd thread to
  * send packets on all netdevs */
 
-struct ovs_mutex poll_mutex;/* Mutex for poll_list. */
+struct ovs_mutex port_mutex;/* Mutex for 'poll_list' and 'tx_ports'. */
 /* List of rx queues to poll. */
 struct ovs_list poll_list OVS_GUARDED;
-int poll_cnt;   /* Number of elemints in poll_list. */
+/* Number of elements in 'poll_list' */
+int poll_cnt;
+/* Map of 'tx_port's used for transmission.  Written by the main thread,
+ * read by the pmd thread. */
+struct hmap tx_ports OVS_GUARDED;
+
+/* Map of 'tx_port' used in the fast path. This is a thread-local copy of
+ * 'tx_ports'. The instance for cpu core NON_PMD_CORE_ID can be accessed
+ * by multiple threads, and thusly need to be protected by 'non_pmd_mutex'.
+ * Every other instance will only be accessed by its own pmd thread. */
+struct hmap port_cache;
 
 /* Only a pmd thread can write on its own 'cycles' and 'stats'.
  * The main thread keeps 'stats_zero' and 'cycles_zero' as base
@@ -494,7 +512,7 @@ dp_netdev_pmd_get_next(struct dp_netdev *dp, struct 
cmap_position *pos);
 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
 static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
 static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id);
-static void dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread *pmd);
+static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
  struct dp_netdev_port *port);
 static void
@@ -508,6 +526,8 @@ static void dp_netdev_reset_pmd_threads(struct dp_netdev 
*dp);
 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
+static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
+OVS_REQUIRES(pmd->port_mutex);
 
 static inline bool emc_entry_alive(struct emc_entry *ce);
 static void emc_clear_entry(struct emc_entry *ce);
@@ -690,7 +710,7 @@ pmd_info_show_rxq(struct ds *reply, struct 
dp_netdev_pmd_thread *pmd)
 ds_put_format(reply, "pmd thread numa_id %d core_id %u:\n",
   pmd->numa_id, pmd->core_id);
 
-ovs_mutex_lock(>poll_mutex);
+ovs_mutex_lock(>port_mutex);
 LIST_FOR_EACH (poll, node, >poll_list) {
 const char *name = netdev_get_name(poll->port->netdev);
 
@@ -704,7 +724,7 @@ pmd_info_show_rxq(struct ds *reply, struct 
dp_netdev_pmd_thread *pmd)
 ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx));
 prev_name 

[ovs-dev] [PATCH v8 06/16] dpif-netdev: Remove duplicate code in dp_netdev_set_pmds_on_numa().

2016-04-19 Thread Daniele Di Proietto
Instead of duplicating code to add ports in
dp_netdev_set_pmds_on_numa(), we can always use
dp_netdev_add_port_to_pmds__().

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 58 +--
 1 file changed, 22 insertions(+), 36 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index fbd23cf..cedaf39 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3113,13 +3113,12 @@ dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, 
struct dp_netdev_port *port,
 
 /* Cannot create pmd threads for invalid numa node. */
 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
+dp_netdev_set_pmds_on_numa(dp, numa_id);
 
 for (i = 0; i < port->n_rxq; i++) {
 pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
 if (!pmd) {
-/* There is no pmd threads on this numa node. */
-dp_netdev_set_pmds_on_numa(dp, numa_id);
-/* Assigning of rx queues done. */
+VLOG_WARN("There's no pmd thread on numa node %d", numa_id);
 break;
 }
 
@@ -3158,9 +3157,9 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int 
numa_id)
 int n_pmds;
 
 if (!ovs_numa_numa_id_is_valid(numa_id)) {
-VLOG_ERR("Cannot create pmd threads due to numa id (%d)"
- "invalid", numa_id);
-return ;
+VLOG_WARN("Cannot create pmd threads due to numa id (%d) invalid",
+  numa_id);
+return;
 }
 
 n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
@@ -3169,46 +3168,25 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int 
numa_id)
  * in which 'netdev' is on, do nothing.  Else, creates the
  * pmd threads for the numa node. */
 if (!n_pmds) {
-int can_have, n_unpinned, i, index = 0;
-struct dp_netdev_pmd_thread **pmds;
-struct dp_netdev_port *port;
+int can_have, n_unpinned, i;
 
 n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
 if (!n_unpinned) {
-VLOG_ERR("Cannot create pmd threads due to out of unpinned "
- "cores on numa node %d", numa_id);
+VLOG_WARN("Cannot create pmd threads due to out of unpinned "
+  "cores on numa node %d", numa_id);
 return;
 }
 
 /* If cpu mask is specified, uses all unpinned cores, otherwise
  * tries creating NR_PMD_THREADS pmd threads. */
 can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, 
NR_PMD_THREADS);
-pmds = xzalloc(can_have * sizeof *pmds);
 for (i = 0; i < can_have; i++) {
 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
-pmds[i] = xzalloc(sizeof **pmds);
-dp_netdev_configure_pmd(pmds[i], dp, core_id, numa_id);
-}
+struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
 
-/* Distributes rx queues of this numa node between new pmd threads. */
-CMAP_FOR_EACH (port, node, >ports) {
-if (netdev_is_pmd(port->netdev)
-&& netdev_get_numa_id(port->netdev) == numa_id) {
-for (i = 0; i < port->n_rxq; i++) {
-/* Make thread-safety analyser happy. */
-ovs_mutex_lock([index]->poll_mutex);
-dp_netdev_add_rxq_to_pmd(pmds[index], port, port->rxq[i]);
-ovs_mutex_unlock([index]->poll_mutex);
-index = (index + 1) % can_have;
-}
-}
-}
-
-/* Actual start of pmd threads. */
-for (i = 0; i < can_have; i++) {
-pmds[i]->thread = ovs_thread_create("pmd", pmd_thread_main, 
pmds[i]);
+dp_netdev_configure_pmd(pmd, dp, core_id, numa_id);
+pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
 }
-free(pmds);
 VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
 }
 }
@@ -3220,14 +3198,22 @@ static void
 dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
 {
 struct dp_netdev_port *port;
+struct hmapx to_reload = HMAPX_INITIALIZER(_reload);
+struct hmapx_node *node;
 
 CMAP_FOR_EACH (port, node, >ports) {
 if (netdev_is_pmd(port->netdev)) {
-int numa_id = netdev_get_numa_id(port->netdev);
-
-dp_netdev_set_pmds_on_numa(dp, numa_id);
+dp_netdev_add_port_to_pmds__(dp, port, _reload);
 }
 }
+
+HMAPX_FOR_EACH (node, _reload) {
+struct dp_netdev_pmd_thread *pmd;
+pmd = (struct dp_netdev_pmd_thread *) node->data;
+dp_netdev_reload_pmd__(pmd);
+}
+
+hmapx_destroy(_reload);
 }
 
 static char *
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v8 05/16] dpif-netdev: Fix race condition in pmd thread initialization.

2016-04-19 Thread Daniele Di Proietto
The pmds and the main thread are synchronized using a condition
variable.  The main thread writes a new configuration, then it waits on
the condition variable.  A pmd thread reads the new configuration, then
it calls signal() on the condition variable. To make sure that the pmds
and the main thread have a consistent view, each signal() should be
backed by a wait().

Currently the first signal() doesn't have a corresponding wait().  If
the pmd thread takes a long time to start and the signal() is received
by a later wait, the threads will have an inconsistent view.

This commit fixes the problem by removing the first signal() from the
pmd thread.

This is hardly a problem on current master, because the main thread
will call the first wait() a long time after the creation of a pmd
thread.  It becomes a problem with the next commits.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c3be4eb..fbd23cf 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2651,21 +2651,22 @@ dpif_netdev_wait(struct dpif *dpif)
 
 static int
 pmd_load_queues(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **ppoll_list)
-OVS_REQUIRES(pmd->poll_mutex)
 {
 struct rxq_poll *poll_list = *ppoll_list;
 struct rxq_poll *poll;
 int i;
 
+ovs_mutex_lock(>poll_mutex);
 poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);
 
 i = 0;
 LIST_FOR_EACH (poll, node, >poll_list) {
 poll_list[i++] = *poll;
 }
+ovs_mutex_unlock(>poll_mutex);
 
 *ppoll_list = poll_list;
-return pmd->poll_cnt;
+return i;
 }
 
 static void *
@@ -2675,6 +2676,7 @@ pmd_thread_main(void *f_)
 unsigned int lc = 0;
 struct rxq_poll *poll_list;
 unsigned int port_seq = PMD_INITIAL_SEQ;
+bool exiting;
 int poll_cnt;
 int i;
 
@@ -2684,13 +2686,10 @@ pmd_thread_main(void *f_)
 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
 pmd_thread_setaffinity_cpu(pmd->core_id);
+poll_cnt = pmd_load_queues(pmd, _list);
 reload:
 emc_cache_init(>flow_cache);
 
-ovs_mutex_lock(>poll_mutex);
-poll_cnt = pmd_load_queues(pmd, _list);
-ovs_mutex_unlock(>poll_mutex);
-
 /* List port/core affinity */
 for (i = 0; i < poll_cnt; i++) {
VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
@@ -2698,10 +2697,6 @@ reload:
 netdev_rxq_get_queue_id(poll_list[i].rx));
 }
 
-/* Signal here to make sure the pmd finishes
- * reloading the updated configuration. */
-dp_netdev_pmd_reload_done(pmd);
-
 for (;;) {
 for (i = 0; i < poll_cnt; i++) {
 dp_netdev_process_rxq_port(pmd, poll_list[i].port, 
poll_list[i].rx);
@@ -2724,14 +2719,18 @@ reload:
 }
 }
 
+poll_cnt = pmd_load_queues(pmd, _list);
+exiting = latch_is_set(>exit_latch);
+/* Signal here to make sure the pmd finishes
+ * reloading the updated configuration. */
+dp_netdev_pmd_reload_done(pmd);
+
 emc_cache_uninit(>flow_cache);
 
-if (!latch_is_set(>exit_latch)){
+if (!exiting) {
 goto reload;
 }
 
-dp_netdev_pmd_reload_done(pmd);
-
 free(poll_list);
 return NULL;
 }
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v8 03/16] dpif-netdev: Factor out port_create() from do_add_port().

2016-04-19 Thread Daniele Di Proietto
Instead of performing every operation inside do_add_port(), it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 69 ++-
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 060f5e0..a224b43 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1095,29 +1095,22 @@ hash_port_no(odp_port_t port_no)
 }
 
 static int
-do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
-odp_port_t port_no)
-OVS_REQUIRES(dp->port_mutex)
+port_create(const char *devname, const char *open_type, const char *type,
+odp_port_t port_no, struct dp_netdev_port **portp)
 {
 struct netdev_saved_flags *sf;
 struct dp_netdev_port *port;
-struct netdev *netdev;
 enum netdev_flags flags;
-const char *open_type;
-int error = 0;
-int i, n_open_rxqs = 0;
+struct netdev *netdev;
+int n_open_rxqs = 0;
+int i, error;
 
-/* Reject devices already in 'dp'. */
-if (!get_port_by_name(dp, devname, )) {
-error = EEXIST;
-goto out;
-}
+*portp = NULL;
 
 /* Open and validate network device. */
-open_type = dpif_netdev_port_open_type(dp->class, type);
 error = netdev_open(devname, open_type, );
 if (error) {
-goto out;
+return error;
 }
 /* XXX reject non-Ethernet devices */
 
@@ -1125,7 +1118,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 if (flags & NETDEV_LOOPBACK) {
 VLOG_ERR("%s: cannot add a loopback device", devname);
 error = EINVAL;
-goto out_close;
+goto out;
 }
 
 if (netdev_is_pmd(netdev)) {
@@ -1134,7 +1127,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 if (n_cores == OVS_CORE_UNSPEC) {
 VLOG_ERR("%s, cannot get cpu core info", devname);
 error = ENOENT;
-goto out_close;
+goto out;
 }
 /* There can only be ovs_numa_get_n_cores() pmd threads,
  * so creates a txq for each, and one extra for the non
@@ -1143,14 +1136,14 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
   netdev_requested_n_rxq(netdev));
 if (error && (error != EOPNOTSUPP)) {
 VLOG_ERR("%s, cannot set multiq", devname);
-goto out_close;
+goto out;
 }
 }
 port = xzalloc(sizeof *port);
 port->port_no = port_no;
 port->netdev = netdev;
 port->n_rxq = netdev_n_rxq(netdev);
-port->rxq = xmalloc(sizeof *port->rxq * port->n_rxq);
+port->rxq = xcalloc(port->n_rxq, sizeof *port->rxq);
 port->type = xstrdup(type);
 port->latest_requested_n_rxq = netdev_requested_n_rxq(netdev);
 
@@ -1170,12 +1163,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 }
 port->sf = sf;
 
-cmap_insert(>ports, >node, hash_port_no(port_no));
-
-if (netdev_is_pmd(netdev)) {
-dp_netdev_add_port_to_pmds(dp, port);
-}
-seq_change(dp->port_seq);
+*portp = port;
 
 return 0;
 
@@ -1186,13 +1174,42 @@ out_rxq_close:
 free(port->type);
 free(port->rxq);
 free(port);
-out_close:
-netdev_close(netdev);
+
 out:
+netdev_close(netdev);
 return error;
 }
 
 static int
+do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
+odp_port_t port_no)
+OVS_REQUIRES(dp->port_mutex)
+{
+struct dp_netdev_port *port;
+int error;
+
+/* Reject devices already in 'dp'. */
+if (!get_port_by_name(dp, devname, )) {
+return EEXIST;
+}
+
+error = port_create(devname, dpif_netdev_port_open_type(dp->class, type),
+type, port_no, );
+if (error) {
+return error;
+}
+
+cmap_insert(>ports, >node, hash_port_no(port_no));
+
+if (netdev_is_pmd(port->netdev)) {
+dp_netdev_add_port_to_pmds(dp, port);
+}
+seq_change(dp->port_seq);
+
+return 0;
+}
+
+static int
 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
  odp_port_t *port_nop)
 {
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v8 04/16] dpif-netdev: Add functions to modify rxq without reloading pmd threads.

2016-04-19 Thread Daniele Di Proietto
This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by the next commits.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 77 ---
 1 file changed, 56 insertions(+), 21 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index a224b43..c3be4eb 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -495,8 +495,6 @@ static void dp_netdev_destroy_all_pmds(struct dp_netdev 
*dp);
 static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
 static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id);
 static void dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread *pmd);
-static void dp_netdev_del_port_from_pmd(struct dp_netdev_port *port,
-struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
  struct dp_netdev_port *port);
 static void
@@ -3002,11 +3000,11 @@ dp_netdev_pmd_clear_poll_list(struct 
dp_netdev_pmd_thread *pmd)
 ovs_mutex_unlock(>poll_mutex);
 }
 
-/* Deletes all rx queues of 'port' from poll_list of pmd thread and
- * reloads it if poll_list was changed. */
-static void
-dp_netdev_del_port_from_pmd(struct dp_netdev_port *port,
-struct dp_netdev_pmd_thread *pmd)
+/* Deletes all rx queues of 'port' from poll_list of pmd thread.  Returns true
+ * if 'port' was found in 'pmd' (therefore a restart is required). */
+static bool
+dp_netdev_del_port_from_pmd__(struct dp_netdev_port *port,
+  struct dp_netdev_pmd_thread *pmd)
 {
 struct rxq_poll *poll, *next;
 bool found = false;
@@ -3021,8 +3019,30 @@ dp_netdev_del_port_from_pmd(struct dp_netdev_port *port,
 }
 }
 ovs_mutex_unlock(>poll_mutex);
-if (found) {
-dp_netdev_reload_pmd__(pmd);
+
+return found;
+}
+
+/* Deletes all rx queues of 'port' from all pmd threads.  The pmd threads that
+ * need to be restarted are inserted in 'to_reload'. */
+static void
+dp_netdev_del_port_from_all_pmds__(struct dp_netdev *dp,
+   struct dp_netdev_port *port,
+   struct hmapx *to_reload)
+{
+int numa_id = netdev_get_numa_id(port->netdev);
+struct dp_netdev_pmd_thread *pmd;
+
+CMAP_FOR_EACH (pmd, node, >poll_threads) {
+if (pmd->numa_id == numa_id) {
+bool found;
+
+found = dp_netdev_del_port_from_pmd__(port, pmd);
+
+if (found) {
+hmapx_add(to_reload, pmd);
+}
+   }
 }
 }
 
@@ -3032,16 +3052,21 @@ static void
 dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
  struct dp_netdev_port *port)
 {
-int numa_id = netdev_get_numa_id(port->netdev);
 struct dp_netdev_pmd_thread *pmd;
+struct hmapx to_reload = HMAPX_INITIALIZER(_reload);
+struct hmapx_node *node;
 
-CMAP_FOR_EACH (pmd, node, >poll_threads) {
-if (pmd->numa_id == numa_id) {
-dp_netdev_del_port_from_pmd(port, pmd);
-   }
+dp_netdev_del_port_from_all_pmds__(dp, port, _reload);
+
+HMAPX_FOR_EACH (node, _reload) {
+pmd = (struct dp_netdev_pmd_thread *) node->data;
+dp_netdev_reload_pmd__(pmd);
 }
+
+hmapx_destroy(_reload);
 }
 
+
 /* Returns PMD thread from this numa node with fewer rx queues to poll.
  * Returns NULL if there is no PMD threads on this numa node.
  * Can be called safely only by main thread. */
@@ -3077,18 +3102,16 @@ dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread 
*pmd,
 pmd->poll_cnt++;
 }
 
-/* Distributes all rx queues of 'port' between all PMD threads and reloads
- * them if needed. */
+/* Distributes all rx queues of 'port' between all PMD threads in 'dp'. The
+ * pmd threads that need to be restarted are inserted in 'to_reload'. */
 static void
-dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
+dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, struct dp_netdev_port *port,
+ struct hmapx *to_reload)
 {
 int numa_id = netdev_get_numa_id(port->netdev);
 struct dp_netdev_pmd_thread *pmd;
-struct hmapx to_reload;
-struct hmapx_node *node;
 int i;
 
-hmapx_init(_reload);
 /* Cannot create pmd threads for invalid numa node. */
 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
 
@@ -3105,8 +3128,20 @@ dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct 
dp_netdev_port *port)
 dp_netdev_add_rxq_to_pmd(pmd, port, port->rxq[i]);
 ovs_mutex_unlock(>poll_mutex);
 
-hmapx_add(_reload, pmd);
+hmapx_add(to_reload, pmd);
 }
+}
+
+/* Distributes all rx queues of 'port' between all PMD threads in 'dp' and
+ * reloads them, if needed. */
+s

[ovs-dev] [PATCH v8 01/16] dpif-netdev: Destroy 'port_mutex' in dp_netdev_free().

2016-04-19 Thread Daniele Di Proietto
Found by inspection.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 1e8a37c..24717cc 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -986,6 +986,7 @@ dp_netdev_free(struct dp_netdev *dp)
 
 seq_destroy(dp->port_seq);
 cmap_destroy(>ports);
+ovs_mutex_destroy(>port_mutex);
 
 /* Upcalls must be disabled at this point */
 dp_netdev_destroy_upcall_lock(dp);
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v8 00/16] Reconfigure netdev at runtime

2016-04-19 Thread Daniele Di Proietto
Currently we treat set_multiq() calls specially in netdev and dpif-netdev:
every pmd thread must be stopped and set_multiq() is allowed to destroy and
recreate the device.

I think we can improve this by:
* Generalizing the mechanism to allow changing other parameters at runtime
  (such as MTU).
* Involving the layer above (dpif-netdev) less.  The request for changes
  often comes from below (netdev_dpdk_set_config(), or the vhost new_device()
  callback).  There's no need for dpif-netdev to remember the requested value,
  all that it needs to know is that a configuration change is requested.

This series implements exactly this: a mechanism to allow a netdev provider
to request configuration changes, to which dpif-netdev will respond by
stopping rx/tx and calling a netdev function to apply the new configuration.

The new mechanism is used in this series to replace the set_multiq() call,
but the idea is to use it also at least for:

* Changing the MTU at runtime
* Automatically detecting the number of rx queues for a vhost-user device
* Moving a DPDK vhost device to the proper NUMA socket

The first commits refactor some code in dpif-netdev and, most importantly,
avoid using RCU for ports.  Each thread will have its local copy of all the
ports in the datapath.

The series is also available here:

https://github.com/ddiproietto/ovs/tree/configchangesv8

v8:
* Update comment in rcu.h: ovs_mutex_cond_wait doesn't quiesce.
* Change 'set_multiq' to 'set_tx_multiq'.
* Added documentation in comments and commit messages explaining thread local
  port cache.
* Fixed style issues reported by checkpatch.py.
* Fixed race condition when deleting pmd thread.

v7:
* Dropped already applied patches.
* Stop using RCU for ports.
* Rebased against master.

v6:
* Rebased against master.
* Check return value of netdev_rxq_open().
* Fix comment.

v5:
* Style fixes.
* Fixed a bug in dp_netdev_free() in patch 6.

v4:
* Added another patch to make variable names uniform in netdev-dpdk (no
  functional change)
* Update some netdev comments to document the relation between
  netdev_set_multiq() and netdev_reconfigure()
* Clarify that when netdev_reconfigure() is called no call to netdev_send()
  or netdev_rxq_recv() must be issued.
* Move check to skip reconfiguration in netdev_dpdk_reconfigure() before
  rte_eth_dev_stop().

v3:
* Fixed another outdated comment about rx queue configuration, as pointed out
  by Mark
* Removed unnecessary and buggy initialization of requested_n_rxq in
  reconfigure_pmd_threads().
* Removed unused 'err' variable in netdev_dpdk_set_multiq().
* Changed comparison in netdev_set_multiq() to use previous
  'netdev->requested_n_txq' instead of 'netdev->up.n_txq'
* Return immediately in netdev_dpdk_reconfigure() if configuration didn't
  change anything.

v2:
* Fixed do_add_port(): we have to call netdev_reconfigure() before opening
  the rxqs.  This prevents memory leaks, and makes sure that the datapath
  polls the appropriate number of queues
* Fixed netdev_dpdk_vhost_set_multiq(): it must call
  netdev_request_reconfigure(). Since it is now equal to
  netdev_dpdk_set_multiq(), the two functions have been merged.
* Fixed netdev_dpdk_set_config(): dev->requested_n_rxq is now accessed
  while holding the appropriate mutex.
* Fixed some outdated comments about rx queue configuration.


Daniele Di Proietto (16):
  dpif-netdev: Destroy 'port_mutex' in dp_netdev_free().
  dpif-netdev: Remove unused 'index' in dp_netdev_pmd_thread.
  dpif-netdev: Factor out port_create() from do_add_port().
  dpif-netdev: Add functions to modify rxq without reloading pmd
threads.
  dpif-netdev: Fix race condition in pmd thread initialization.
  dpif-netdev: Remove duplicate code in dp_netdev_set_pmds_on_numa().
  hmap: Add HMAP_FOR_EACH_POP.
  dpif-netdev: Add pmd thread local port cache for transmission.
  hmap: Use struct for hmap_at_position().
  dpif-netdev: Use hmap for ports.
  ovs-thread: Do not quiesce in ovs_mutex_cond_wait().
  ofproto-dpif: Call dpif_poll_threads_set() before dpif_run().
  dpif-netdev: Change pmd thread configuration in dpif_netdev_run().
  dpif-netdev: Handle errors in reconfigure_pmd_threads().
  netdev: Add reconfigure request mechanism.
  netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

 lib/cfm.c|   5 +-
 lib/dpif-netdev.c| 702 +++
 lib/dpif-provider.h  |   3 +-
 lib/hmap.c   |  26 +-
 lib/hmap.h   |  11 +-
 lib/id-pool.c|   5 +-
 lib/learning-switch.c|   5 +-
 lib/netdev-bsd.c |   3 +-
 lib/netdev-dpdk.c| 194 ++--
 lib/netdev-dummy.c   |   3 +-
 lib/netdev-linux.c   |   8 +-
 lib/netdev-provider.h|  50 ++-
 lib/netdev-vport.c   |   3 +-
 lib/netdev.c |  75 +++--
 lib/netdev.h |   7 +-
 lib/odp-util.c   |   7 +-
 l

[ovs-dev] [PATCH v8 02/16] dpif-netdev: Remove unused 'index' in dp_netdev_pmd_thread.

2016-04-19 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 24717cc..060f5e0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -430,8 +430,6 @@ struct dp_netdev_pmd_thread {
 struct latch exit_latch;/* For terminating the pmd thread. */
 atomic_uint change_seq; /* For reloading pmd ports. */
 pthread_t thread;
-int index;  /* Idx of this pmd thread among pmd*/
-/* threads on same numa node. */
 unsigned core_id;   /* CPU core id of this pmd thread. */
 int numa_id;/* numa node id of this pmd thread. */
 atomic_int tx_qid;  /* Queue id used by this pmd thread to
@@ -485,8 +483,8 @@ static void dp_netdev_recirculate(struct 
dp_netdev_pmd_thread *,
 static void dp_netdev_disable_upcall(struct dp_netdev *);
 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
-struct dp_netdev *dp, int index,
-unsigned core_id, int numa_id);
+struct dp_netdev *dp, unsigned core_id,
+int numa_id);
 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_set_nonpmd(struct dp_netdev *dp);
 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
@@ -2787,8 +2785,7 @@ dp_netdev_set_nonpmd(struct dp_netdev *dp)
 struct dp_netdev_pmd_thread *non_pmd;
 
 non_pmd = xzalloc(sizeof *non_pmd);
-dp_netdev_configure_pmd(non_pmd, dp, 0, NON_PMD_CORE_ID,
-OVS_NUMA_UNSPEC);
+dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
 }
 
 /* Caller must have valid pointer to 'pmd'. */
@@ -2829,10 +2826,9 @@ dp_netdev_pmd_get_next(struct dp_netdev *dp, struct 
cmap_position *pos)
 /* Configures the 'pmd' based on the input argument. */
 static void
 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
-int index, unsigned core_id, int numa_id)
+unsigned core_id, int numa_id)
 {
 pmd->dp = dp;
-pmd->index = index;
 pmd->core_id = core_id;
 pmd->numa_id = numa_id;
 pmd->poll_cnt = 0;
@@ -3140,7 +3136,7 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int 
numa_id)
 for (i = 0; i < can_have; i++) {
 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
 pmds[i] = xzalloc(sizeof **pmds);
-dp_netdev_configure_pmd(pmds[i], dp, i, core_id, numa_id);
+dp_netdev_configure_pmd(pmds[i], dp, core_id, numa_id);
 }
 
 /* Distributes rx queues of this numa node between new pmd threads. */
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH] checkpatch: Accept form feeds.

2016-04-18 Thread Daniele Di Proietto


On 18/04/2016 14:26, "Ben Pfaff" <b...@ovn.org> wrote:

>On Mon, Apr 18, 2016 at 02:02:37PM -0700, Daniele Di Proietto wrote:
>> CodingStyle.md says:
>> 
>> "Use form feeds (control+L) to divide long source files into logical
>> pieces.  A form feed should appear as the only character on a line."
>> 
>> checkpatch.py currently complains about form feed. For example, on
>> commit 2c06d9a927c5("ovstest: Add test-netlink-conntrack command."),
>> checkpatch.py returns:
>> 
>> W(140): Line has non-spaces leading whitespace
>> W(140): Line has trailing whitespace
>> +
>> 
>> W(177): Line has non-spaces leading whitespace
>> W(177): Line has trailing whitespace
>> +
>> 
>> W(199): Line has non-spaces leading whitespace
>> W(199): Line has trailing whitespace
>> +
>> 
>> This commit suppresses the two warnings for lines with form feeds as the
>> only character.
>> 
>> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>
>Thanks.
>
>Acked-by: Ben Pfaff <b...@ovn.org>

Thanks, pushed to master!

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v7 08/16] dpif-netdev: Add pmd thread local port cache for transmission.

2016-04-18 Thread Daniele Di Proietto


On 18/04/2016 07:50, "Ilya Maximets" <i.maxim...@samsung.com> wrote:

>On 08.04.2016 06:13, Daniele Di Proietto wrote:
>> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>> ---
>>  lib/dpif-netdev.c | 243
>>+++---
>>  1 file changed, 175 insertions(+), 68 deletions(-)
>> 
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 8c5893d..5d1cc43 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -185,6 +185,7 @@ static bool dpcls_lookup(const struct dpcls *cls,
>>   *
>>   *dp_netdev_mutex (global)
>>   *port_mutex
>> + *non_pmd_mutex
>>   */
>>  struct dp_netdev {
>>  const struct dpif_class *const class;
>> @@ -380,6 +381,13 @@ struct rxq_poll {
>>  struct ovs_list node;
>>  };
>>  
>> +/* Contained by struct dp_netdev_pmd_thread's 'port_cache' or
>>'tx_ports'. */
>> +struct tx_port {
>> +odp_port_t port_no;
>> +struct netdev *netdev;
>> +struct hmap_node node;
>> +};
>> +
>>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to
>>eliminate
>>   * the performance overhead of interrupt processing.  Therefore netdev
>>can
>>   * not implement rx-wait for these devices.  dpif-netdev needs to poll
>> @@ -436,10 +444,18 @@ struct dp_netdev_pmd_thread {
>>  atomic_int tx_qid;  /* Queue id used by this pmd
>>thread to
>>   * send packets on all netdevs */
>>  
>> -struct ovs_mutex poll_mutex;/* Mutex for poll_list. */
>> +struct ovs_mutex port_mutex;/* Mutex for 'poll_list' and
>>'tx_ports'. */
>>  /* List of rx queues to poll. */
>>  struct ovs_list poll_list OVS_GUARDED;
>> -int poll_cnt;   /* Number of elemints in
>>poll_list. */
>> +/* Number of elements in 'poll_list' */
>> +int poll_cnt;
>> +/* Map of 'tx_port's used for transmission.  Written by the main
>>thread,
>> + * read by the pmd thread. */
>> +struct hmap tx_ports OVS_GUARDED;
>> +
>> +/* Map of 'tx_port' used in the fast path. This is a thread-local
>>copy
>> + * 'tx_ports'. */
>> +struct hmap port_cache;
>>  
>>  /* Only a pmd thread can write on its own 'cycles' and 'stats'.
>>   * The main thread keeps 'stats_zero' and 'cycles_zero' as base
>> @@ -495,7 +511,7 @@ dp_netdev_pmd_get_next(struct dp_netdev *dp, struct
>>cmap_position *pos);
>>  static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
>>  static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int
>>numa_id);
>>  static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int
>>numa_id);
>> -static void dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread
>>*pmd);
>> +static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread
>>*pmd);
>>  static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
>>   struct dp_netdev_port
>>*port);
>>  static void
>> @@ -509,6 +525,8 @@ static void dp_netdev_reset_pmd_threads(struct
>>dp_netdev *dp);
>>  static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
>>  static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
>>  static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
>> +static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
>> +OVS_REQUIRES(pmd->port_mutex);
>>  
>>  static inline bool emc_entry_alive(struct emc_entry *ce);
>>  static void emc_clear_entry(struct emc_entry *ce);
>> @@ -691,7 +709,7 @@ pmd_info_show_rxq(struct ds *reply, struct
>>dp_netdev_pmd_thread *pmd)
>>  ds_put_format(reply, "pmd thread numa_id %d core_id %u:\n",
>>pmd->numa_id, pmd->core_id);
>>  
>> -ovs_mutex_lock(&pmd->poll_mutex);
>> +ovs_mutex_lock(&pmd->port_mutex);
>>  LIST_FOR_EACH (poll, node, &pmd->poll_list) {
>>  const char *name = netdev_get_name(poll->port->netdev);
>>  
>> @@ -705,7 +723,7 @@ pmd_info_show_rxq(struct ds *reply, struct
>>dp_netdev_pmd_thread *pmd)
>>  ds_put_format(reply, " %d",
>>netdev_rxq_get_queue_id(poll->rx));
>>  prev_name = name;
>>  }
>> -ovs_mutex_unlock(&pmd->poll_mutex);
>> +ovs_mutex_unlock(&pmd->port_mutex);
>>  ds_put_c

Re: [ovs-dev] [PATCH v2 00/15] Userspace (DPDK) connection tracker

2016-04-18 Thread Daniele Di Proietto
Hi Antonio,

thanks for the feedback

On 18/04/2016 09:56, "Fischetti, Antonio" <antonio.fische...@intel.com>
wrote:

>Hi Daniele,
>I just started to have a look to your new v2 patch set.
>A minor comment - I know this is a bit of nit-picking - but I ran
>utilities/checkpatch.py and I got some output like
>
>patch 4/15
>W(1692): Line has trailing whitespace
>+static inline void ct_lock_lock(struct ct_lock *lock)
>
>patch 5/15
>W(166): Line is greater than 79-characters long
>+threads[i].thread = ovs_thread_create("ct_thread",
>ct_thread_main, &threads[i]);
>
>W(191): Line is greater than 79-characters long
>+{"benchmark", "n_threads n_pkts batch_size [change_connection]", 3,
>4, test_benchmark},
>
>patch 6/15
>W(52): Line is greater than 79-characters long
>+ovs_fatal(0, "batch_size must be between 1 and
>NETDEV_MAX_BURST(%u)",

You're right there are several genuine errors reported by checkpatch.py.

I'll fix them.

>
>patches 7/15 and 8/15, 10/15, 11/15 - as there's no comment into the
>patch, maybe 
>they just need an empty line before
>"Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>"?
>The output is
>E: No signatures found.
>E: Too many signoffs; are you missing Co-authored-by lines?

I don't see this error.  How did you get the patches?  Did you use
patchwork?
In any case, as you point out it is a false positive.

I found another small checkpatch issue with this series and I've sent a
patch here:

http://openvswitch.org/pipermail/dev/2016-April/069795.html

>
>I'll go on and have a look into the code, I hope I provide some useful
>feedback.
>
>Thanks,
>Antonio

Thanks for the report,

Daniele

>
>> -Original Message-
>> From: dev [mailto:dev-boun...@openvswitch.org] On Behalf Of Daniele Di
>> Proietto
>> Sent: Saturday, April 16, 2016 1:03 AM
>> To: dev@openvswitch.org
>> Subject: [ovs-dev] [PATCH v2 00/15] Userspace (DPDK) connection tracker
>> 
>> This series aims to implement the ct() action for the dpif-netdev
>>datapath.
>> The bulk of the code is in the new conntrack module: it contains some
>>packet
>> parsing code, some lookup tables and the logic to implement all the ct
>>bits.
>> 
>> The conntrack module is helped by conntrack-tcp, for TCP window and
>>flags
>> tracking: the bulk of the code of this submodule is from the FreeBSD's
>>pf
>> subsystem, therefore is BSD licensed.
>> 
>> The rest of the series integrates the connection tracker with the rest
>>of
>> OVS: the ct() action is implemented in dpif-netdev, and the debugging
>> interfaces required by dpctl/{dump,flush}-conntrack are implemented.
>> 
>> Besides adding some unit tests, this series ports the existing conntrack
>> system test to the userspace datapath.  Some small modifications are
>> required to pass the testsuite, and some tests still have to be skipped.
>> 
>> On newer kernels the userspace testsuite has some problems with
>>offloads,
>> so a workaround is included.
>> 
>> This can also be downloaded at:
>> 
>> https://github.com/ddiproietto/ovs/tree/userconntrack_20160415
>> 
>> Any feedback is appreciated, thanks.
>> 
>> v1 -> v2:
>> * Fixed bug in tcp_get_wscale(), related to TCP options parsing.
>> * Changed names of ICMP constants: now they're different from Linux and
>>   FreeBSD.
>> * Fixed bug in parse_ipv6_ext_hdrs().
>> * Used ALWAYS_INLINE in parse_vlan and parse_ethertype, to avoid a
>>   performance regression in miniflow_extract().
>> * Updated copyright info in COPYING and debian/copyright.in.
>> * Rebased.
>> * Changed batching strategy in conntrack_execute() to allow a newly
>>   created connection to be picked up by packets in the same batch.
>> * Added an ovs-test module to throw pcap files at the connection
>>tracker.
>> * Added a workaround for the userspace testsuite on new kernels and a
>>tcp
>>   non-conntrack test.
>> 
>> Daniele Di Proietto (15):
>>   packets: Define ICMP types.
>>   flow: Export parse_ipv6_ext_hdrs().
>>   flow: Introduce parse_dl_type().
>>   conntrack: New userspace connection tracker.
>>   tests: Add very simple conntrack benchmark.
>>   tests: Add test-conntrack pcap test.
>>   conntrack: Implement flush function.
>>   conntrack: Implement dumping to ct_entry.
>>   dpif-netdev: Execute conntrack action.
>>   dpif-netdev: Implement conntrack dump functions.
>>   dpif-netdev: Implement conntrack flush interface.
>>   tests: Add conntrack ofproto-dpif tests.
>>   system-tests

[ovs-dev] [PATCH] checkpatch: Accept form feeds.

2016-04-18 Thread Daniele Di Proietto
CodingStyle.md says:

"Use form feeds (control+L) to divide long source files into logical
pieces.  A form feed should appear as the only character on a line."

checkpatch.py currently complains about form feed. For example, on
commit 2c06d9a927c5("ovstest: Add test-netlink-conntrack command."),
checkpatch.py returns:

W(140): Line has non-spaces leading whitespace
W(140): Line has trailing whitespace
+

W(177): Line has non-spaces leading whitespace
W(177): Line has trailing whitespace
+

W(199): Line has non-spaces leading whitespace
W(199): Line has trailing whitespace
+

This commit suppresses the two warnings for lines with form feeds as the
only character.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 utilities/checkpatch.py | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py
index dbdcbc8..b641560 100755
--- a/utilities/checkpatch.py
+++ b/utilities/checkpatch.py
@@ -47,6 +47,7 @@ __regex_added_line = re.compile(r'^\+{1,2}[^\+][\w\W]*')
 __regex_leading_with_whitespace_at_all = re.compile(r'^\s+')
 __regex_leading_with_spaces = re.compile(r'^ +[\S]+')
 __regex_trailing_whitespace = re.compile(r'[^\S]+$')
+__regex_single_line_feed = re.compile(r'^\f$')
 __regex_for_if_missing_whitespace = re.compile(r'(if|for|while)[\(]')
 __regex_for_if_too_much_whitespace = re.compile(r'(if|for|while)  +[\(]')
 __regex_for_if_parens_whitespace = re.compile(r'(if|for|while) \( +[\s\S]+\)')
@@ -75,8 +76,10 @@ def leading_whitespace_is_spaces(line):
 """
 if skip_leading_whitespace_check:
 return True
-if __regex_leading_with_whitespace_at_all.search(line) is not None:
+if (__regex_leading_with_whitespace_at_all.search(line) is not None and
+__regex_single_line_feed.search(line) is None):
 return __regex_leading_with_spaces.search(line) is not None
+
 return True
 
 
@@ -85,7 +88,8 @@ def trailing_whitespace_or_crlf(line):
 """
 if skip_trailing_whitespace_check:
 return False
-return __regex_trailing_whitespace.search(line) is not None
+return (__regex_trailing_whitespace.search(line) is not None and
+__regex_single_line_feed.search(line) is None)
 
 
 def if_and_for_whitespace_checks(line):
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH] datapath-windows: Fix bug in OvsTcpGetWscale().

2016-04-15 Thread Daniele Di Proietto
The userspace conntrack had a bug in tcp_get_wscale(), where the length
of an option would be read from the third octet of the option TLV
instead of the second.  This could cause an incorrect wscale value to
be returned, and it would at least impact performance.

Also use 'int' instead of 'unsigned' for 'len', since the value can be
negative.

CC: Sairam Venugopal <vsai...@vmware.com>
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---

I tested a similar fix on the userspace connection tracker, but I didn't
compile this for the windows datapath.

---
 datapath-windows/ovsext/Conntrack-tcp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datapath-windows/ovsext/Conntrack-tcp.c 
b/datapath-windows/ovsext/Conntrack-tcp.c
index 3e25ba5..340c469 100644
--- a/datapath-windows/ovsext/Conntrack-tcp.c
+++ b/datapath-windows/ovsext/Conntrack-tcp.c
@@ -166,7 +166,7 @@ OvsConntrackValidateTcpFlags(const TCPHdr *tcp)
 static __inline uint8_t
 OvsTcpGetWscale(const TCPHdr *tcp)
 {
-unsigned len = tcp->doff * 4 - sizeof *tcp;
+int len = tcp->doff * 4 - sizeof *tcp;
 const uint8_t *opt = (const uint8_t *)(tcp + 1);
 uint8_t wscale = 0;
 uint8_t optlen;
@@ -185,7 +185,7 @@ OvsTcpGetWscale(const TCPHdr *tcp)
 wscale |= CT_WSCALE_FLAG;
 /* fall through */
 default:
-optlen = opt[2];
+optlen = opt[1];
 if (optlen < 2) {
 optlen = 2;
 }
@@ -529,4 +529,4 @@ OvsNewTcpConntrack(const TCPHdr *tcp,
 OvsConntrackUpdateExpiration(newconn, now, CT_ENTRY_TIMEOUT);
 
 return &newconn->up;
-}
\ No newline at end of file
+}
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v2 15/15] system-tests: Run conntrack tests with userspace

2016-04-15 Thread Daniele Di Proietto
The userspace connection tracker doesn't support ALGs, frag reassembly
or NAT yet, so skip those tests.

Also, connection tracking state input from a local port is not possible
in userspace.

Finally, the userspace datapath pads all frames with 0, to make them at
least 64 bytes.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 tests/system-kmod-macros.at  | 28 +++
 tests/system-traffic.at  | 49 ++--
 tests/system-userspace-macros.at | 45 +---
 3 files changed, 107 insertions(+), 15 deletions(-)

diff --git a/tests/system-kmod-macros.at b/tests/system-kmod-macros.at
index 8e60929..4cecc23 100644
--- a/tests/system-kmod-macros.at
+++ b/tests/system-kmod-macros.at
@@ -65,3 +65,31 @@ m4_define([CHECK_CONNTRACK],
  on_exit 'ovstest test-netlink-conntrack flush'
 ]
 )
+
+# CHECK_CONNTRACK_ALG()
+#
+# Perform requirements checks for running conntrack ALG tests. The kernel
+# always supports ALG, so no check is needed.
+#
+m4_define([CHECK_CONNTRACK_ALG])
+
+# CHECK_CONNTRACK_FRAG()
+#
+# Perform requirements checks for running conntrack fragmentations tests.
+# The kernel always supports fragmentation, so no check is needed.
+m4_define([CHECK_CONNTRACK_FRAG])
+
+# CHECK_CONNTRACK_LOCAL_STACK()
+#
+# Perform requirements checks for running conntrack tests with local stack.
+# The kernel always supports reading the connection state of an skb coming
+# from an internal port, without an explicit ct() action, so no check is
+# needed.
+m4_define([CHECK_CONNTRACK_LOCAL_STACK])
+
+# CHECK_CONNTRACK_NAT()
+#
+# Perform requirements checks for running conntrack NAT tests. The kernel
+# always supports NAT, so no check is needed.
+#
+m4_define([CHECK_CONNTRACK_NAT])
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index c8fbe0d..241175b 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -579,7 +579,8 @@ NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 3 -T 1 
--retry-connrefused -v -o wget0
 dnl (again) HTTP requests from p0->p1 should work fine.
 NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 3 -T 1 --retry-connrefused -v -o 
wget0.log])
 
-AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl
+dnl The userspace connection tracker here has a different internal TCP state 
(CLOSING). Ignore that.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | grep -v 
"state=CLOSING"], [0], [dnl
 
tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),zone=1,protoinfo=(state=SYN_SENT)
 
tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),zone=2,protoinfo=(state=TIME_WAIT)
 ])
@@ -589,6 +590,7 @@ AT_CLEANUP
 
 AT_SETUP([conntrack - multiple zones, local])
 CHECK_CONNTRACK()
+CHECK_CONNTRACK_LOCAL_STACK()
 OVS_TRAFFIC_VSWITCHD_START()
 
 ADD_NAMESPACES(at_ns0)
@@ -636,6 +638,7 @@ AT_CLEANUP
 
 AT_SETUP([conntrack - multiple namespaces, internal ports])
 CHECK_CONNTRACK()
+CHECK_CONNTRACK_LOCAL_STACK()
 OVS_TRAFFIC_VSWITCHD_START(
[set-fail-mode br0 secure -- ])
 
@@ -676,6 +679,7 @@ AT_CLEANUP
 
 AT_SETUP([conntrack - multi-stage pipeline, local])
 CHECK_CONNTRACK()
+CHECK_CONNTRACK_LOCAL_STACK()
 OVS_TRAFFIC_VSWITCHD_START()
 
 ADD_NAMESPACES(at_ns0)
@@ -969,11 +973,11 @@ dnl UDP packets from ns0->ns1 should solicit "destination 
unreachable" response.
 NS_CHECK_EXEC([at_ns0], [bash -c "echo a | nc $NC_EOF_OPT -u 10.1.1.2 1"])
 
 AT_CHECK([ovs-appctl revalidator/purge], [0])
-AT_CHECK([ovs-ofctl dump-flows br0 | ofctl_strip | sort | grep -v drop], [0], 
[dnl
- n_packets=1, n_bytes=44, priority=100,udp,in_port=1 
actions=ct(commit,exec(load:0x1->NXM_NX_CT_MARK[[]])),output:2
- n_packets=1, n_bytes=72, 
priority=100,ct_state=+rel+trk,ct_mark=0x1,icmp,in_port=2 actions=output:1
- n_packets=1, n_bytes=72, priority=100,ct_state=-trk,icmp,in_port=2 
actions=ct(table=0)
- n_packets=2, n_bytes=84, priority=10,arp actions=NORMAL
+AT_CHECK([ovs-ofctl dump-flows br0 | ofctl_strip | sort | grep -v drop | sed 
-e 's/n_bytes=[[0-9]]*/n_bytes=/g'], [0], [dnl
+ n_packets=1, n_bytes=, priority=100,udp,in_port=1 
actions=ct(commit,exec(load:0x1->NXM_NX_CT_MARK[[]])),output:2
+ n_packets=1, n_bytes=, 
priority=100,ct_state=+rel+trk,ct_mark=0x1,icmp,in_port=2 actions=output:1
+ n_packets=1, n_bytes=, priority=100,ct_state=-trk,icmp,in_port=2 
actions=ct(table=0)
+ n_packets=2, n_bytes=, priority=10,arp actions=NORMAL
 NXST_FLOW reply:
 ])
 
@@ -1027,6 +1031,7 @@ AT_CLEANUP
 AT_SETUP([conntrack - FTP])
 AT_SKIP_IF([test $HAVE_PYFTPDLIB = no])
 CHECK_CONNTRACK()
+CHECK_CONNTRACK_ALG()
 OVS_TRAFFIC_VSWITCHD_START()
 
 ADD_NAMESPACES(at_ns0, at_ns1)
@@ -1109,6 +1114,7 @@ AT_CLEANUP
 AT_SETUP([conntrack - IPv6 FTP])
 AT_SKIP_IF([test $HAVE_PYFTPDLIB = no])
 CHECK_CONNTRACK()
+CHECK_CONNTRACK_ALG()
 OVS_TRAFFIC_VSWITCHD_START()
 
 A

[ovs-dev] [PATCH v2 04/15] conntrack: New userspace connection tracker.

2016-04-15 Thread Daniele Di Proietto
This commit adds the conntrack module.

It is a connection tracker that resides entirely in userspace.  Its
primary user will be the dpif-netdev datapath.

The module main goal is to provide conntrack_execute(), which offers a
convenient interface to implement the datapath ct() action.

The conntrack module uses two submodules to deal with the l4 protocol
details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).

The conntrack-tcp submodule implementation is adapted from FreeBSD's pf
subsystem, therefore it's BSD licensed.  It has been slightly altered to
match the OVS coding style and to allow the pickup of already
established connections.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 COPYING |   1 +
 debian/copyright.in |   4 +
 lib/automake.mk |   5 +
 lib/conntrack-other.c   |  91 ++
 lib/conntrack-private.h |  77 +
 lib/conntrack-tcp.c | 476 +++
 lib/conntrack.c | 851 
 lib/conntrack.h | 144 
 8 files changed, 1649 insertions(+)
 create mode 100644 lib/conntrack-other.c
 create mode 100644 lib/conntrack-private.h
 create mode 100644 lib/conntrack-tcp.c
 create mode 100644 lib/conntrack.c
 create mode 100644 lib/conntrack.h

diff --git a/COPYING b/COPYING
index 308e3ea..afb98b9 100644
--- a/COPYING
+++ b/COPYING
@@ -25,6 +25,7 @@ License, version 2.
 The following files are licensed under the 2-clause BSD license.
 include/windows/getopt.h
 lib/getopt_long.c
+lib/conntrack-tcp.c
 
 The following files are licensed under the 3-clause BSD-license
 include/windows/netinet/icmp6.h
diff --git a/debian/copyright.in b/debian/copyright.in
index 57d007a..a15f4dd 100644
--- a/debian/copyright.in
+++ b/debian/copyright.in
@@ -21,6 +21,9 @@ Upstream Copyright Holders:
Copyright (c) 2014 Michael Chapman
Copyright (c) 2014 WindRiver, Inc.
Copyright (c) 2014 Avaya, Inc.
+   Copyright (c) 2001 Daniel Hartmeier
+   Copyright (c) 2002 - 2008 Henning Brauer
+   Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org>
 
 License:
 
@@ -90,6 +93,7 @@ License:
lib/getopt_long.c
include/windows/getopt.h
datapath-windows/ovsext/Conntrack-tcp.c
+   lib/conntrack-tcp.c
 
 * The following files are licensed under the 3-clause BSD-license
 
diff --git a/lib/automake.mk b/lib/automake.mk
index 1ec2115..ba30442 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \
lib/compiler.h \
lib/connectivity.c \
lib/connectivity.h \
+   lib/conntrack-private.h \
+   lib/conntrack-tcp.c \
+   lib/conntrack-other.c \
+   lib/conntrack.c \
+   lib/conntrack.h \
lib/coverage.c \
lib/coverage.h \
lib/crc32c.c \
diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c
new file mode 100644
index 000..65d02a9
--- /dev/null
+++ b/lib/conntrack-other.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+
+#include "conntrack-private.h"
+#include "dp-packet.h"
+
+enum other_state {
+OTHERS_FIRST,
+OTHERS_MULTIPLE,
+OTHERS_BIDIR,
+};
+
+struct conn_other {
+struct conn up;
+enum other_state state;
+};
+
+static const long long other_timeouts[] = {
+[OTHERS_FIRST] = 60 * 1000,
+[OTHERS_MULTIPLE] = 60 * 1000,
+[OTHERS_BIDIR] = 30 * 1000,
+};
+
+static struct conn_other *
+conn_other_cast(const struct conn *conn)
+{
+return CONTAINER_OF(conn, struct conn_other, up);
+}
+
+static void
+update_expiration(struct conn_other *conn, long long now)
+{
+conn->up.expiration = now + other_timeouts[conn->state];
+}
+
+static enum ct_update_res
+other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED,
+bool reply, long long now)
+{
+struct conn_other *conn = conn_other_cast(conn_);
+
+if (reply && conn->state != OTHERS_BIDIR) {
+conn->state = OTHERS_BIDIR;
+} else if (conn->state == OTHERS_FIRST) {
+conn->state = OTHERS_MULTIPLE;
+}
+
+update_expiration(conn, now);
+
+return CT_UPDATE_VALID;
+}
+
+static bool
+other_valid_new(struct dp_packet *pkt OVS_UNUSED)
+{
+return true;
+}
+
+static struct conn *
+other_new_conn(st

[ovs-dev] [PATCH v2 13/15] system-tests: Disable offloads in userspace tests.

2016-04-15 Thread Daniele Di Proietto
The system userspace testsuite uses the userspace datapath with
netdev-linux devices, connected to veth pairs with the AF_PACKET socket:

 (veth pair) (AF_PACKET)
TCP stack -> p0 ---> ovs-p0  -> netdev-linux (userspace OVS)

Unfortunately this configuration has some problems with offloads: a
packet generated by the TCP stack may be sent to p0 without being
checksummed or segmented. The AF_PACKET socket, by default, ignores the
offloads and just transmits the data of the packets to userspace, but:

1. The packet may need GSO, so the data will be too big to be received
   by the userspace datapath
2. The packet might have incomplete checksums, so it will likely be
   discarded by the receiver.

Problem 1 causes TCP connections to see a congestion window smaller than
the MTU, which hurts performance but doesn't prevent communication.

Problem 2 was hidden in the testsuite by a Linux kernel bug, fixed by
commit ce8c839b74e3("veth: don’t modify ip_summed; doing so treats
packets with bad checksums as good").  In the kernels that include the
fix, the userspace datapath is able to process pings, but not tcp or udp
data.

Unfortunately I couldn't find a way to ask the AF_PACKET to perform
offloads in kernel.  A possible fix would be to use the PACKET_VNET_HDR
sockopt and perform the offloads in userspace.

Until a proper fix is worked out for netdev-linux, this commit disables
offloads on the non-OVS side of the veth pair, as a workaround.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 tests/system-common-macros.at|  1 +
 tests/system-kmod-macros.at  |  7 +++
 tests/system-userspace-macros.at | 18 ++
 3 files changed, 26 insertions(+)

diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at
index 92569f9..60f86eb 100644
--- a/tests/system-common-macros.at
+++ b/tests/system-common-macros.at
@@ -67,6 +67,7 @@ m4_define([ADD_INT],
 #
 m4_define([ADD_VETH],
 [ AT_CHECK([ip link add $1 type veth peer name ovs-$1 || return 77])
+  CONFIGURE_VETH_OFFLOADS([$1])
   AT_CHECK([ip link set $1 netns $2])
   AT_CHECK([ip link set dev ovs-$1 up])
   AT_CHECK([ovs-vsctl add-port $3 ovs-$1])
diff --git a/tests/system-kmod-macros.at b/tests/system-kmod-macros.at
index 20ee7bf..8e60929 100644
--- a/tests/system-kmod-macros.at
+++ b/tests/system-kmod-macros.at
@@ -44,6 +44,13 @@ m4_define([OVS_TRAFFIC_VSWITCHD_STOP],
AT_CHECK([:; $2])
   ])
 
+# CONFIGURE_VETH_OFFLOADS([VETH])
+#
+# The kernel datapath has no problem with offloads and veths. Nothing
+# to do here.
+m4_define([CONFIGURE_VETH_OFFLOADS],
+)
+
 # CHECK_CONNTRACK()
 #
 # Perform requirements checks for running conntrack tests, and flush the
diff --git a/tests/system-userspace-macros.at b/tests/system-userspace-macros.at
index 4fed777..c09a4aa 100644
--- a/tests/system-userspace-macros.at
+++ b/tests/system-userspace-macros.at
@@ -40,6 +40,24 @@ m4_define([OVS_TRAFFIC_VSWITCHD_STOP],
AT_CHECK([:; $2])
   ])
 
+# CONFIGURE_VETH_OFFLOADS([VETH])
+#
+# Disable TX offloads for veths.  The userspace datapath uses the AF_PACKET
+# socket to receive packets for veths.  Unfortunately, the AF_PACKET socket
+# doesn't play well with offloads:
+# 1. GSO packets are received without segmentation and therefore discarded.
+# 2. Packets with offloaded partial checksum are received with the wrong
+#checksum, therefore discarded by the receiver.
+#
+# By disabling tx offloads in the non-OVS side of the veth peer we make sure
+# that the AF_PACKET socket will not receive bad packets.
+#
+# This is a workaround, and should be removed when offloads are properly
+# supported in netdev-linux.
+m4_define([CONFIGURE_VETH_OFFLOADS],
+[ethtool -K $1 tx off]
+)
+
 # CHECK_CONNTRACK()
 #
 # Perform requirements checks for running conntrack tests, and flush the
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v2 12/15] tests: Add conntrack ofproto-dpif tests.

2016-04-15 Thread Daniele Di Proietto
While the system testsuite already has connection tracking tests, it
will be still useful to add some to the standard testsuite because:

* They're run more often by developers.
* Some of them are more interesting for the userspace datapath.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 tests/ofproto-dpif.at | 678 ++
 1 file changed, 678 insertions(+)

diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 90d7702..cf60da6 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -7420,5 +7420,683 @@ AT_CHECK([grep "Final flow:" stdout], [0], [Final flow: 
unchanged
 AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(100)'], [0], [stdout])
 AT_CHECK([grep "Final flow:" stdout], [0], [Final flow: unchanged
 ])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
+AT_SETUP([ofproto-dpif - conntrack - controller])
+OVS_VSWITCHD_START
+
+add_of_ports br0 1 2
+
+AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg vconn:info ofproto_dpif:info])
+
+dnl Allow new connections on p1->p2, but not on p2->p1.
+AT_DATA([flows.txt], [dnl
+priority=1,action=drop
+priority=10,arp,action=normal
+priority=100,in_port=1,udp,action=ct(commit,zone=0),controller
+priority=100,in_port=2,ct_state=-trk,udp,action=ct(table=0,zone=0)
+priority=100,in_port=2,ct_state=+trk+est-new,udp,action=controller
+])
+
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+
+AT_CAPTURE_FILE([ofctl_monitor.log])
+AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl -P nxt_packet_in --detach 
--no-chdir --pidfile 2> ofctl_monitor.log])
+
+AT_CHECK([ovs-appctl netdev-dummy/receive p2 
'in_port(2),eth(src=50:54:00:00:00:0a,dst=50:54:00:00:00:09),eth_type(0x0800),ipv4(src=10.1.1.2,dst=10.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=2,dst=1)'])
+
+dnl OK, now start a new connection from port 1.
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 
'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.1.1.1,dst=10.1.1.2,proto=17,tos=0,ttl=64,frag=no),udp(src=1,dst=2)'])
+
+dnl Now try a reply from port 2.
+AT_CHECK([ovs-appctl netdev-dummy/receive p2 
'in_port(2),eth(src=50:54:00:00:00:0a,dst=50:54:00:00:00:09),eth_type(0x0800),ipv4(src=10.1.1.2,dst=10.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=2,dst=1)'])
+
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 4])
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+dnl Check this output. We only see the latter two packets, not the first.
+dnl Note that the first packet doesn't have the ct_state bits set. This
+dnl happens because the ct_state field is available only after recirc.
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=60 in_port=1 (via action) 
data_len=60 (unbuffered)
+udp,vlan_tci=0x,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=0,nw_ttl=64,tp_src=1,tp_dst=2
 udp_csum:0
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=60 
ct_state=est|rpl|trk,in_port=2 (via action) data_len=60 (unbuffered)
+udp,vlan_tci=0x,dl_src=50:54:00:00:00:0a,dl_dst=50:54:00:00:00:09,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,tp_src=2,tp_dst=1
 udp_csum:0
+])
+
+AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl -P nxt_packet_in --detach 
--no-chdir --pidfile 2> ofctl_monitor.log])
+
+dnl OK, now start a second connection from port 1
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 
'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.1.1.1,dst=10.1.1.2,proto=17,tos=0,ttl=64,frag=no),udp(src=3,dst=4)'])
+
+dnl Now try a reply from port 2.
+AT_CHECK([ovs-appctl netdev-dummy/receive p2 
'in_port(2),eth(src=50:54:00:00:00:0a,dst=50:54:00:00:00:09),eth_type(0x0800),ipv4(src=10.1.1.2,dst=10.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=4,dst=3)'])
+
+
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 4])
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+dnl Check this output. We should see both packets
+dnl Note that the first packet doesn't have the ct_state bits set. This
+dnl happens because the ct_state field is available only after recirc.
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=60 in_port=1 (via action) 
data_len=60 (unbuffered)
+udp,vlan_tci=0x,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=0,nw_ttl=64,tp_src=3,tp_dst=4
 udp_csum:0
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=60 
ct_state=est|rpl|trk,in_port=2 (via action) data_len=60 (unbuffered)
+udp,vlan_tci=0x,dl_src=50:54:00:00:00:0a,dl_dst=50:54:00:00:00:09,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,tp_src=4,tp_dst=3
 udp_csum:0
+])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
+AT_SETUP([ofproto-dpif - conntrack - ipv6])
+OVS_VSWITCHD_START
+
+add_of_ports br0 1 2
+
+AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg vconn:info ofproto_dpif:info])
+
+dnl Al

[ovs-dev] [PATCH v2 08/15] conntrack: Implement dumping to ct_entry.

2016-04-15 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/conntrack-private.h |   3 ++
 lib/conntrack-tcp.c |  34 +
 lib/conntrack.c | 125 
 lib/conntrack.h |  16 +++
 4 files changed, 178 insertions(+)

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index e668c44..1f04dc2 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -21,6 +21,7 @@
 #include 
 #include 
 
+#include "ct-dpif.h"
 #include "hmap.h"
 #include "openvswitch/types.h"
 #include "packets.h"
@@ -69,6 +70,8 @@ struct ct_l4_proto {
 bool (*valid_new)(struct dp_packet *pkt);
 enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,
   bool reply, long long now);
+void (*conn_get_protoinfo)(const struct conn *,
+   struct ct_dpif_protoinfo *);
 };
 
 extern struct ct_l4_proto ct_proto_tcp;
diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c
index 4d80038..f723fb2 100644
--- a/lib/conntrack-tcp.c
+++ b/lib/conntrack-tcp.c
@@ -469,8 +469,42 @@ tcp_new_conn(struct dp_packet *pkt, long long now)
 return >up;
 }
 
+static uint8_t
+tcp_peer_to_protoinfo_flags(const struct tcp_peer *peer)
+{
+uint8_t res = 0;
+
+if (peer->wscale & CT_WSCALE_FLAG) {
+res |= CT_DPIF_TCPF_WINDOW_SCALE;
+}
+
+if (peer->wscale & CT_WSCALE_UNKNOWN) {
+res |= CT_DPIF_TCPF_BE_LIBERAL;
+}
+
+return res;
+}
+
+/* Fills 'protoinfo' with the TCP-specific information of 'conn_': for each
+ * direction, the peer state, the window scale and the protoinfo flags.
+ * peer[0] is reported in the "orig" fields, peer[1] in the "reply" fields. */
+static void
+tcp_conn_get_protoinfo(const struct conn *conn_,
+                       struct ct_dpif_protoinfo *protoinfo)
+{
+    const struct conn_tcp *conn = conn_tcp_cast(conn_);
+
+    protoinfo->proto = IPPROTO_TCP;
+    protoinfo->tcp.state_orig = conn->peer[0].state;
+    protoinfo->tcp.state_reply = conn->peer[1].state;
+
+    protoinfo->tcp.wscale_orig = conn->peer[0].wscale & CT_WSCALE_MASK;
+    protoinfo->tcp.wscale_reply = conn->peer[1].wscale & CT_WSCALE_MASK;
+
+    protoinfo->tcp.flags_orig = tcp_peer_to_protoinfo_flags(&conn->peer[0]);
+    /* Reply flags come from peer[1], like state_reply and wscale_reply. */
+    protoinfo->tcp.flags_reply = tcp_peer_to_protoinfo_flags(&conn->peer[1]);
+}
+
 struct ct_l4_proto ct_proto_tcp = {
 .new_conn = tcp_new_conn,
 .valid_new = tcp_valid_new,
 .conn_update = tcp_conn_update,
+.conn_get_protoinfo = tcp_conn_get_protoinfo,
 };
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 7913e76..7dc896d 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -24,6 +24,7 @@
 
 #include "bitmap.h"
 #include "conntrack-private.h"
+#include "ct-dpif.h"
 #include "dp-packet.h"
 #include "flow.h"
 #include "hmap.h"
@@ -850,6 +851,130 @@ delete_conn(struct conn *conn)
 free(conn);
 }
 
+static void
+ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
+ union ct_dpif_inet_addr *b,
+ ovs_be16 dl_type)
+{
+if (dl_type == htons(ETH_TYPE_IP)) {
+b->ip = a->ipv4_aligned;
+} else if (dl_type == htons(ETH_TYPE_IPV6)){
+b->in6 = a->ipv6_aligned;
+}
+}
+
+static void
+conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
+{
+if (key->dl_type == htons(ETH_TYPE_IP)) {
+tuple->l3_type = AF_INET;
+} else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
+tuple->l3_type = AF_INET6;
+}
+tuple->ip_proto = key->nw_proto;
+ct_endpoint_to_ct_dpif_inet_addr(>src.addr, >src,
+ key->dl_type);
+ct_endpoint_to_ct_dpif_inet_addr(>dst.addr, >dst,
+ key->dl_type);
+
+if (key->nw_proto == IPPROTO_ICMP) {
+tuple->icmp_id = key->src.port;
+/* ICMP type and code are not tracked */
+tuple->icmp_type = 0;
+tuple->icmp_code = 0;
+} else {
+tuple->src_port = key->src.port;
+tuple->dst_port = key->dst.port;
+}
+}
+
+static void
+conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
+  long long now)
+{
+struct ct_l4_proto *class;
+long long expiration;
+memset(entry, 0, sizeof *entry);
+conn_key_to_tuple(>key, >tuple_orig);
+conn_key_to_tuple(>rev_key, >tuple_reply);
+
+entry->zone = conn->key.zone;
+entry->mark = conn->mark;
+
+memcpy(>labels, >label, sizeof(entry->labels));
+/* Not implemented yet */
+entry->timestamp.start = 0;
+entry->timestamp.stop = 0;
+
+expiration = conn->expiration - now;
+entry->timeout = (expiration > 0) ? expiration / 1000: 0;
+
+class = l4_protos[conn->key.nw_proto];
+if (class->conn_get_protoinfo) {
+class->conn_get_protoinfo(conn, >protoinfo);
+}

[ovs-dev] [PATCH v2 06/15] tests: Add test-conntrack pcap test.

2016-04-15 Thread Daniele Di Proietto
Simple program that runs the packets in a pcap file through the
connection tracker and prints the 'ct_state' of each packet.

E.g. the line:

`./tests/ovstest test-conntrack pcap capture.pcap 2`

sends the packets in `capture.pcap` to the connection tracker, 2 per
call.

Useful for debugging.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 tests/test-conntrack.c | 65 ++
 1 file changed, 65 insertions(+)

diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c
index 414d7dc..5292320 100644
--- a/tests/test-conntrack.c
+++ b/tests/test-conntrack.c
@@ -23,6 +23,7 @@
 #include "netdev.h"
 #include "ovs-thread.h"
 #include "ovstest.h"
+#include "pcap-file.h"
 #include "timeval.h"
 
 static const char payload[] = 
"5054000a505400090800451c00110a0101010a010102000100020008";
@@ -141,6 +142,66 @@ test_benchmark(struct ovs_cmdl_context *ctx)
 pthread_barrier_destroy();
 free(threads);
 }
+
+static void
+test_pcap(struct ovs_cmdl_context *ctx)
+{
+struct dp_packet *pkts[NETDEV_MAX_BURST];
+size_t total_count, i, pkt_count, batch_size;
+FILE *pcap;
+int err;
+
+pcap = ovs_pcap_open(ctx->argv[1], "rb");
+if (!pcap) {
+return;
+}
+
+batch_size = 1;
+if (ctx->argc > 2) {
+batch_size = strtoul(ctx->argv[2], NULL, 0);
+if (batch_size == 0 || batch_size > NETDEV_MAX_BURST) {
+ovs_fatal(0, "batch_size must be between 1 and 
NETDEV_MAX_BURST(%u)",
+  NETDEV_MAX_BURST);
+}
+}
+
+conntrack_init();
+total_count = 0;
+for (;;) {
+for (i = 0; i < batch_size; i++) {
+struct flow dummy_flow;
+
+err = ovs_pcap_read(pcap, [i], NULL);
+if (err) {
+break;
+}
+flow_extract(pkts[i], _flow);
+}
+
+pkt_count = i;
+if (pkt_count == 0) {
+break;
+}
+
+conntrack_execute(, pkts, pkt_count, true, 0, NULL, NULL, NULL);
+
+for (i = 0; i < pkt_count; i++) {
+struct ds ds = DS_EMPTY_INITIALIZER;
+
+total_count++;
+
+format_flags(, ct_state_to_string, pkts[i]->md.ct_state, '|');
+printf("%"PRIuSIZE": %s\n", total_count, ds_cstr());
+
+dp_packet_delete(pkts[i]);
+ds_destroy();
+}
+if (err) {
+break;
+}
+}
+conntrack_destroy();
+}
 
 static const struct ovs_cmdl_command commands[] = {
 /* Connection tracker tests. */
@@ -149,6 +210,10 @@ static const struct ovs_cmdl_command commands[] = {
  * is '1', each packet in a batch will have a different source and
  * destination port */
 {"benchmark", "n_threads n_pkts batch_size [change_connection]", 3, 4, 
test_benchmark},
+/* Reads packets from 'file' and sends them to the connection tracker,
+ * 'batch_size' (1 by default) per call, with the commit flag set.
+ * Prints the ct_state of each packet. */
+{"pcap", "file [batch_size]", 1, 2, test_pcap},
 
 {NULL, NULL, 0, 0, NULL},
 };
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v2 14/15] system-tests: Add tcp simple test.

2016-04-15 Thread Daniele Di Proietto
Useful to test the datapath's ability to forward TCP packets without the
complexity of connection tracking.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 tests/system-traffic.at | 20 
 1 file changed, 20 insertions(+)

diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index dceae15..c8fbe0d 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -23,6 +23,26 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 
10.1.1.2 | FORMAT_PING
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([datapath - http between two ports])
+OVS_TRAFFIC_VSWITCHD_START()
+
+AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], 
[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+NETNS_DAEMONIZE([at_ns1], [[$PYTHON $srcdir/test-l7.py]], [http0.pid])
+NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 3 -T 1 --retry-connrefused -v -o 
wget0.log])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([datapath - ping between two ports on vlan])
 OVS_TRAFFIC_VSWITCHD_START()
 
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v2 09/15] dpif-netdev: Execute conntrack action.

2016-04-15 Thread Daniele Di Proietto
This commit implements the OVS_ACTION_ATTR_CT action in dpif-netdev.

To allow ofproto-dpif to detect the conntrack feature, flow_put will no
longer discard flows with ct_* fields set. We still shouldn't allow
flows with NAT bits set, since there is no support for NAT.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 68 +--
 tests/dpif-netdev.at  | 14 +--
 tests/ofproto-dpif.at | 20 +++
 3 files changed, 77 insertions(+), 25 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 1e8a37c..436359a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -33,6 +33,7 @@
 
 #include "bitmap.h"
 #include "cmap.h"
+#include "conntrack.h"
 #include "coverage.h"
 #include "csum.h"
 #include "dp-packet.h"
@@ -89,9 +90,17 @@ static struct shash dp_netdevs 
OVS_GUARDED_BY(dp_netdev_mutex)
 
 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
 
+#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
+ | CS_INVALID | CS_REPLY_DIR | CS_TRACKED)
+#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
+
 static struct odp_support dp_netdev_support = {
 .max_mpls_depth = SIZE_MAX,
 .recirc = true,
+.ct_state = true,
+.ct_zone = true,
+.ct_mark = true,
+.ct_label = true,
 };
 
 /* Stores a miniflow with inline values */
@@ -224,6 +233,8 @@ struct dp_netdev {
 /* Cpu mask for pin of pmd threads. */
 char *pmd_cmask;
 uint64_t last_tnl_conf_seq;
+
+struct conntrack conntrack;
 };
 
 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
@@ -906,6 +917,8 @@ create_dp_netdev(const char *name, const struct dpif_class 
*class,
 dp->upcall_aux = NULL;
 dp->upcall_cb = NULL;
 
+conntrack_init(>conntrack);
+
 cmap_init(>poll_threads);
 ovs_mutex_init_recursive(>non_pmd_mutex);
 ovsthread_key_create(>per_pmd_key, NULL);
@@ -976,6 +989,8 @@ dp_netdev_free(struct dp_netdev *dp)
 ovs_mutex_destroy(>non_pmd_mutex);
 ovsthread_key_delete(dp->per_pmd_key);
 
+conntrack_destroy(>conntrack);
+
 ovs_mutex_lock(>port_mutex);
 CMAP_FOR_EACH (port, node, >ports) {
 /* PMD threads are destroyed here. do_del_port() cannot quiesce */
@@ -1965,9 +1980,7 @@ dpif_netdev_flow_from_nlattrs(const struct nlattr *key, 
uint32_t key_len,
 return EINVAL;
 }
 
-/* Userspace datapath doesn't support conntrack. */
-if (flow->ct_state || flow->ct_zone || flow->ct_mark
-|| !ovs_u128_is_zero(>ct_label)) {
+if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
 return EINVAL;
 }
 
@@ -2604,6 +2617,9 @@ dpif_netdev_run(struct dpif *dpif)
 ovs_mutex_unlock(>non_pmd_mutex);
 dp_netdev_pmd_unref(non_pmd);
 
+/* XXX: If workload is too heavy we could add a separate thread. */
+conntrack_run(>conntrack);
+
 tnl_neigh_cache_run();
 tnl_port_map_run();
 new_tnl_seq = seq_read(tnl_conf_seq);
@@ -3850,12 +3866,48 @@ dp_execute_cb(void *aux_, struct dp_packet **packets, 
int cnt,
 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
 break;
 
-case OVS_ACTION_ATTR_CT:
-/* If a flow with this action is slow-pathed, datapath assistance is
- * required to implement it. However, we don't support this action
- * in the userspace datapath. */
-VLOG_WARN("Cannot execute conntrack action in userspace.");
+case OVS_ACTION_ATTR_CT: {
+const struct nlattr *b;
+bool commit = false;
+unsigned int left;
+uint16_t zone = 0;
+const char *helper = NULL;
+const uint32_t *setmark = NULL;
+const struct ovs_key_ct_labels *setlabel = NULL;
+
+
+/* XXX parsing this everytime is expensive.  We should do like kernel
+ * does and create a structure. */
+NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a), nl_attr_get_size(a)) 
{
+enum ovs_ct_attr sub_type = nl_attr_type(b);
+
+switch(sub_type) {
+case OVS_CT_ATTR_COMMIT:
+commit = true;
+break;
+case OVS_CT_ATTR_ZONE:
+zone = nl_attr_get_u16(b);
+break;
+case OVS_CT_ATTR_HELPER:
+helper = nl_attr_get_string(b);
+break;
+case OVS_CT_ATTR_MARK:
+setmark = nl_attr_get(b);
+break;
+case OVS_CT_ATTR_LABELS:
+setlabel = nl_attr_get(b);
+break;
+case OVS_CT_ATTR_NAT:
+case OVS_CT_ATTR_UNSPEC:
+case __OVS_CT_ATTR_MAX:
+OVS_NOT_REACHED();
+}
+

[ovs-dev] [PATCH v2 05/15] tests: Add very simple conntrack benchmark.

2016-04-15 Thread Daniele Di Proietto
This introduces a very limited but simple benchmark for
conntrack_execute(). It just repeatedly sends the same batch of packets
through the connection tracker and reports the time spent processing
them.

While this is not a realistic benchmark, it has proven useful during
development to evaluate different batching and locking strategies.

E.g. the line:

`./tests/ovstest test-conntrack benchmark 1 1488 32`

starts 1 thread that will send 1488 packets to the connection
tracker, 32 at a time. It will print the time taken to process them.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 tests/automake.mk  |   1 +
 tests/test-conntrack.c | 167 +
 2 files changed, 168 insertions(+)
 create mode 100644 tests/test-conntrack.c

diff --git a/tests/automake.mk b/tests/automake.mk
index aed032b..9fbbd5a 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -310,6 +310,7 @@ tests_ovstest_SOURCES = \
tests/test-byte-order.c \
tests/test-classifier.c \
tests/test-cmap.c \
+   tests/test-conntrack.c \
tests/test-csum.c \
tests/test-flows.c \
tests/test-hash.c \
diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c
new file mode 100644
index 000..414d7dc
--- /dev/null
+++ b/tests/test-conntrack.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+#include "conntrack.h"
+
+#include "dp-packet.h"
+#include "fatal-signal.h"
+#include "flow.h"
+#include "netdev.h"
+#include "ovs-thread.h"
+#include "ovstest.h"
+#include "timeval.h"
+
+static const char payload[] = 
"5054000a505400090800451c00110a0101010a010102000100020008";
+
+static struct dp_packet **
+prepare_packets(size_t n, bool change, unsigned tid)
+{
+struct dp_packet **pkts = xcalloc(n, sizeof *pkts);
+struct flow flow;
+size_t i;
+
+for (i = 0; i < n; i++) {
+struct udp_header *udp;
+
+pkts[i] = dp_packet_new(sizeof payload/2);
+dp_packet_put_hex(pkts[i], payload, NULL);
+flow_extract(pkts[i], );
+
+udp = dp_packet_l4(pkts[i]);
+udp->udp_src = htons(ntohs(udp->udp_src) + tid);
+
+if (change) {
+udp->udp_dst = htons(ntohs(udp->udp_dst) + i);
+}
+}
+
+return pkts;
+}
+
+static void
+destroy_packets(struct dp_packet **pkts, size_t n)
+{
+size_t i;
+
+for (i = 0; i < n; i++) {
+dp_packet_delete(pkts[i]);
+}
+
+free(pkts);
+}
+
+struct thread_aux {
+pthread_t thread;
+unsigned tid;
+};
+
+static struct conntrack ct;
+static unsigned long n_threads, n_pkts, batch_size;
+static bool change_conn = false;
+static pthread_barrier_t barrier;
+
+static void *
+ct_thread_main(void *aux_)
+{
+struct thread_aux *aux = aux_;
+struct dp_packet **pkts;
+size_t i;
+
+pkts = prepare_packets(batch_size, change_conn, aux->tid);
+pthread_barrier_wait();
+for (i = 0; i < n_pkts; i += batch_size) {
+conntrack_execute(, pkts, batch_size, true, 0, NULL, NULL, NULL);
+}
+pthread_barrier_wait();
+destroy_packets(pkts, batch_size);
+
+return NULL;
+}
+
+static void
+test_benchmark(struct ovs_cmdl_context *ctx)
+{
+struct thread_aux *threads;
+long long start;
+unsigned i;
+
+fatal_signal_init();
+
+/* Parse arguments */
+n_threads = strtoul(ctx->argv[1], NULL, 0);
+if (!n_threads) {
+ovs_fatal(0, "n_threads must be at least one");
+}
+n_pkts = strtoul(ctx->argv[2], NULL, 0);
+batch_size = strtoul(ctx->argv[3], NULL, 0);
+if (batch_size == 0 || batch_size > NETDEV_MAX_BURST) {
+ovs_fatal(0, "batch_size must be between 1 and NETDEV_MAX_BURST(%u)",
+  NETDEV_MAX_BURST);
+}
+if (ctx->argc > 4) {
+change_conn = strtoul(ctx->argv[4], NULL, 0);
+}
+
+threads = xcalloc(n_threads, sizeof *threads);
+pthread_barrier_init(, NULL, n_threads + 1);
+conntrack_init();
+
+/* Create threads */
+for (i = 0; i < n_threads; i++) {
+threads[i].tid = i;
+threads[i].thread = ovs_thread_create("ct_thread", ct_thread_main, 
[i]);
+}
+/* Starts the work 

[ovs-dev] [PATCH v2 11/15] dpif-netdev: Implement conntrack flush interface.

2016-04-15 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 5ac2bf3..995cbc0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3991,6 +3991,14 @@ dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
 return err;
 }
 
+static int
+dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
+{
+struct dp_netdev *dp = get_dp_netdev(dpif);
+
+return conntrack_flush(>conntrack, zone);
+}
+
 const struct dpif_class dpif_netdev_class = {
 "netdev",
 dpif_netdev_init,
@@ -4034,7 +4042,7 @@ const struct dpif_class dpif_netdev_class = {
 dpif_netdev_ct_dump_start,
 dpif_netdev_ct_dump_next,
 dpif_netdev_ct_dump_done,
-NULL,   /* ct_flush */
+dpif_netdev_ct_flush,
 };
 
 static void
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v2 01/15] packets: Define ICMP types.

2016-04-15 Thread Daniele Di Proietto
Linux and FreeBSD have slightly different names for these constants.
Windows doesn't define them.  It is simpler to redefine them from
scratch for OVS.  The new names differ from those used in Linux
and FreeBSD.

These definitions will be used by a future commit.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/packets.h | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/lib/packets.h b/lib/packets.h
index 8139a6b..3ceff99 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -596,9 +596,21 @@ struct ip_header {
 ovs_16aligned_be32 ip_src;
 ovs_16aligned_be32 ip_dst;
 };
-
 BUILD_ASSERT_DECL(IP_HEADER_LEN == sizeof(struct ip_header));
 
+/* ICMPv4 types. */
+#define ICMP4_ECHO_REPLY 0
+#define ICMP4_DST_UNREACH 3
+#define ICMP4_SOURCEQUENCH 4
+#define ICMP4_REDIRECT 5
+#define ICMP4_ECHO_REQUEST 8
+#define ICMP4_TIME_EXCEEDED 11
+#define ICMP4_PARAM_PROB 12
+#define ICMP4_TIMESTAMP 13
+#define ICMP4_TIMESTAMPREPLY 14
+#define ICMP4_INFOREQUEST 15
+#define ICMP4_INFOREPLY 16
+
 #define ICMP_HEADER_LEN 8
 struct icmp_header {
 uint8_t icmp_type;
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v2 10/15] dpif-netdev: Implement conntrack dump functions.

2016-04-15 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 60 ---
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 436359a..5ac2bf3 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -35,6 +35,7 @@
 #include "cmap.h"
 #include "conntrack.h"
 #include "coverage.h"
+#include "ct-dpif.h"
 #include "csum.h"
 #include "dp-packet.h"
 #include "dpif.h"
@@ -3937,6 +3938,59 @@ dp_netdev_execute_actions(struct dp_netdev_pmd_thread 
*pmd,
 actions_len, dp_execute_cb);
 }
 
+struct dp_netdev_ct_dump {
+struct ct_dpif_dump_state up;
+struct conntrack_dump dump;
+struct conntrack *ct;
+struct dp_netdev *dp;
+};
+
+static int
+dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
+  const uint16_t *pzone)
+{
+struct dp_netdev *dp = get_dp_netdev(dpif);
+struct dp_netdev_ct_dump *dump;
+
+dump = xzalloc(sizeof *dump);
+dump->dp = dp;
+dump->ct = >conntrack;
+
+conntrack_dump_start(>conntrack, >dump, pzone);
+
+*dump_ = >up;
+
+return 0;
+}
+
+static int
+dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
+ struct ct_dpif_dump_state *dump_,
+ struct ct_dpif_entry *entry)
+{
+struct dp_netdev_ct_dump *dump;
+
+INIT_CONTAINER(dump, dump_, up);
+
+return conntrack_dump_next(>dump, entry);
+}
+
+static int
+dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
+ struct ct_dpif_dump_state *dump_)
+{
+struct dp_netdev_ct_dump *dump;
+int err;
+
+INIT_CONTAINER(dump, dump_, up);
+
+err = conntrack_dump_done(>dump);
+
+free(dump);
+
+return err;
+}
+
 const struct dpif_class dpif_netdev_class = {
 "netdev",
 dpif_netdev_init,
@@ -3977,9 +4031,9 @@ const struct dpif_class dpif_netdev_class = {
 dpif_netdev_enable_upcall,
 dpif_netdev_disable_upcall,
 dpif_netdev_get_datapath_version,
-NULL,   /* ct_dump_start */
-NULL,   /* ct_dump_next */
-NULL,   /* ct_dump_done */
+dpif_netdev_ct_dump_start,
+dpif_netdev_ct_dump_next,
+dpif_netdev_ct_dump_done,
 NULL,   /* ct_flush */
 };
 
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v2 02/15] flow: Export parse_ipv6_ext_hdrs().

2016-04-15 Thread Daniele Di Proietto
This will be used by a future commit.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/flow.c | 140 ++---
 lib/flow.h |   3 ++
 2 files changed, 81 insertions(+), 62 deletions(-)

diff --git a/lib/flow.c b/lib/flow.c
index 560a90f..972a996 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -439,6 +439,82 @@ invalid:
 arp_buf[1] = eth_addr_zero;
 }
 
+static inline bool
+parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto,
+  uint8_t *nw_frag)
+{
+while (1) {
+if (OVS_LIKELY((*nw_proto != IPPROTO_HOPOPTS)
+   && (*nw_proto != IPPROTO_ROUTING)
+   && (*nw_proto != IPPROTO_DSTOPTS)
+   && (*nw_proto != IPPROTO_AH)
+   && (*nw_proto != IPPROTO_FRAGMENT))) {
+/* It's either a terminal header (e.g., TCP, UDP) or one we
+ * don't understand.  In either case, we're done with the
+ * packet, so use it to fill in 'nw_proto'. */
+return true;
+}
+
+/* We only verify that at least 8 bytes of the next header are
+ * available, but many of these headers are longer.  Ensure that
+ * accesses within the extension header are within those first 8
+ * bytes. All extension headers are required to be at least 8
+ * bytes. */
+if (OVS_UNLIKELY(*sizep < 8)) {
+return false;
+}
+
+if ((*nw_proto == IPPROTO_HOPOPTS)
+|| (*nw_proto == IPPROTO_ROUTING)
+|| (*nw_proto == IPPROTO_DSTOPTS)) {
+/* These headers, while different, have the fields we care
+ * about in the same location and with the same
+ * interpretation. */
+const struct ip6_ext *ext_hdr = *datap;
+*nw_proto = ext_hdr->ip6e_nxt;
+if (OVS_UNLIKELY(!data_try_pull(datap, sizep,
+(ext_hdr->ip6e_len + 1) * 8))) {
+return false;
+}
+} else if (*nw_proto == IPPROTO_AH) {
+/* A standard AH definition isn't available, but the fields
+ * we care about are in the same location as the generic
+ * option header--only the header length is calculated
+ * differently. */
+const struct ip6_ext *ext_hdr = *datap;
+*nw_proto = ext_hdr->ip6e_nxt;
+if (OVS_UNLIKELY(!data_try_pull(datap, sizep,
+(ext_hdr->ip6e_len + 2) * 4))) {
+return false;
+}
+} else if (*nw_proto == IPPROTO_FRAGMENT) {
+const struct ovs_16aligned_ip6_frag *frag_hdr = *datap;
+
+*nw_proto = frag_hdr->ip6f_nxt;
+if (!data_try_pull(datap, sizep, sizeof *frag_hdr)) {
+return false;
+}
+
+/* We only process the first fragment. */
+if (frag_hdr->ip6f_offlg != htons(0)) {
+*nw_frag = FLOW_NW_FRAG_ANY;
+if ((frag_hdr->ip6f_offlg & IP6F_OFF_MASK) != htons(0)) {
+*nw_frag |= FLOW_NW_FRAG_LATER;
+*nw_proto = IPPROTO_FRAGMENT;
+return true;
+}
+}
+}
+}
+}
+
+bool
+parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto,
+uint8_t *nw_frag)
+{
+return parse_ipv6_ext_hdrs__(datap, sizep, nw_proto, nw_frag);
+}
+
 /* Initializes 'flow' members from 'packet' and 'md'
  *
  * Initializes 'packet' header l2 pointer to the start of the Ethernet
@@ -641,68 +717,8 @@ miniflow_extract(struct dp_packet *packet, struct miniflow 
*dst)
 nw_ttl = nh->ip6_hlim;
 nw_proto = nh->ip6_nxt;
 
-while (1) {
-if (OVS_LIKELY((nw_proto != IPPROTO_HOPOPTS)
-   && (nw_proto != IPPROTO_ROUTING)
-   && (nw_proto != IPPROTO_DSTOPTS)
-   && (nw_proto != IPPROTO_AH)
-   && (nw_proto != IPPROTO_FRAGMENT))) {
-/* It's either a terminal header (e.g., TCP, UDP) or one we
- * don't understand.  In either case, we're done with the
- * packet, so use it to fill in 'nw_proto'. */
-break;
-}
-
-/* We only verify that at least 8 bytes of the next header are
- * available, but many of these headers are longer.  Ensure that
- * accesses within the extension header are within those first 8
- * bytes. All extension headers are required to be at least 8
- * bytes. */
-if (OVS_UNLIKELY(size < 8)) {
-goto out;
-}
-
-if ((n

[ovs-dev] [PATCH v2 03/15] flow: Introduce parse_dl_type().

2016-04-15 Thread Daniele Di Proietto
The function simply returns the Ethernet type of the packet (after
discarding the VLAN tag, if present).  It will be used by a following
commit.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/flow.c | 14 --
 lib/flow.h |  1 +
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/flow.c b/lib/flow.c
index 972a996..0250a7c 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -327,7 +327,7 @@ parse_mpls(const void **datap, size_t *sizep)
 return MIN(count, FLOW_MAX_MPLS_LABELS);
 }
 
-static inline ovs_be16
+static inline ALWAYS_INLINE ovs_be16
 parse_vlan(const void **datap, size_t *sizep)
 {
 const struct eth_header *eth = *datap;
@@ -349,7 +349,7 @@ parse_vlan(const void **datap, size_t *sizep)
 return 0;
 }
 
-static inline ovs_be16
+static inline ALWAYS_INLINE ovs_be16
 parse_ethertype(const void **datap, size_t *sizep)
 {
 const struct llc_snap_header *llc;
@@ -826,6 +826,16 @@ miniflow_extract(struct dp_packet *packet, struct miniflow 
*dst)
 dst->map = mf.map;
 }
 
+ovs_be16
+parse_dl_type(const struct eth_header *data_, size_t size)
+{
+const void *data = data_;
+
+parse_vlan(, );
+
+return parse_ethertype(, );
+}
+
 /* For every bit of a field that is wildcarded in 'wildcards', sets the
  * corresponding bit in 'flow' to zero. */
 void
diff --git a/lib/flow.h b/lib/flow.h
index 6771232..0406198 100644
--- a/lib/flow.h
+++ b/lib/flow.h
@@ -107,6 +107,7 @@ void flow_compose(struct dp_packet *, const struct flow *);
 
 bool parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto,
  uint8_t *nw_frag);
+ovs_be16 parse_dl_type(const struct eth_header *data_, size_t size);
 
 static inline uint64_t
 flow_get_xreg(const struct flow *flow, int idx)
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v2 07/15] conntrack: Implement flush function.

2016-04-15 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/conntrack.c | 21 +
 lib/conntrack.h |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 840335b..7913e76 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -849,3 +849,24 @@ delete_conn(struct conn *conn)
 {
 free(conn);
 }
+
+int
+conntrack_flush(struct conntrack *ct, const uint16_t *zone)
+{
+unsigned i;
+
+for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+struct conn *conn, *next;
+
+ct_lock_lock(>locks[i]);
+HMAP_FOR_EACH_SAFE(conn, next, node, >connections[i]) {
+if (!zone || *zone == conn->key.zone) {
+hmap_remove(>connections[i], >node);
+delete_conn(conn);
+}
+}
+ct_lock_unlock(>locks[i]);
+}
+
+return 0;
+}
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 8561273..7e8b604 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -68,6 +68,8 @@ int conntrack_execute(struct conntrack *, struct dp_packet 
**, size_t,
   bool commit, uint16_t zone, const uint32_t *setmark,
   const struct ovs_key_ct_labels *setlabel,
   const char *helper);
+
+int conntrack_flush(struct conntrack *, const uint16_t *zone);
 
 /* struct ct_lock is a standard mutex or a spinlock when using DPDK */
 
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v2 00/15] Userspace (DPDK) connection tracker

2016-04-15 Thread Daniele Di Proietto
This series aims to implement the ct() action for the dpif-netdev datapath.
The bulk of the code is in the new conntrack module: it contains some packet
parsing code, some lookup tables and the logic to implement all the ct bits.

The conntrack module is helped by conntrack-tcp, for TCP window and flags
tracking: the bulk of the code of this submodule is from the FreeBSD's pf
subsystem, therefore is BSD licensed.

The rest of the series integrates the connection tracker with the rest of
OVS: the ct() action is implemented in dpif-netdev, and the debugging
interfaces required by dpctl/{dump,flush}-conntrack are implemented.

Besides adding some unit tests, this series ports the existing conntrack
system test to the userspace datapath.  Some small modifications are
required to pass the testsuite, and some tests still have to be skipped.

On newer kernels the userspace testsuite has some problems with offloads,
so a workaround is included.

This can also be downloaded at:

https://github.com/ddiproietto/ovs/tree/userconntrack_20160415

Any feedback is appreciated, thanks.

v1 -> v2:
* Fixed bug in tcp_get_wscale(), related to TCP options parsing.
* Changed names of ICMP constants: now they're different from Linux and
  FreeBSD.
* Fixed bug in parse_ipv6_ext_hdrs().
* Used ALWAYS_INLINE in parse_vlan and parse_ethertype, to avoid a
  performance regression in miniflow_extract().
* Updated copyright info in COPYING and debian/copyright.in.
* Rebased.
* Changed batching strategy in conntrack_execute() to allow a newly
  created connection to be picked up by packets in the same batch.
* Added an ovs-test module to throw pcap files at the connection tracker.
* Added a workaround for the userspace testsuite on new kernels and a tcp
  non-conntrack test.

Daniele Di Proietto (15):
  packets: Define ICMP types.
  flow: Export parse_ipv6_ext_hdrs().
  flow: Introduce parse_dl_type().
  conntrack: New userspace connection tracker.
  tests: Add very simple conntrack benchmark.
  tests: Add test-conntrack pcap test.
  conntrack: Implement flush function.
  conntrack: Implement dumping to ct_entry.
  dpif-netdev: Execute conntrack action.
  dpif-netdev: Implement conntrack dump functions.
  dpif-netdev: Implement conntrack flush interface.
  tests: Add conntrack ofproto-dpif tests.
  system-tests: Disable offloads in userspace tests.
  system-tests: Add tcp simple test.
  system-tests: Run conntrack tests with userspace

 COPYING  |   1 +
 debian/copyright.in  |   4 +
 lib/automake.mk  |   5 +
 lib/conntrack-other.c|  91 
 lib/conntrack-private.h  |  80 
 lib/conntrack-tcp.c  | 510 
 lib/conntrack.c  | 997 +++
 lib/conntrack.h  | 162 +++
 lib/dpif-netdev.c| 138 +-
 lib/flow.c   | 154 +++---
 lib/flow.h   |   4 +
 lib/packets.h|  14 +-
 tests/automake.mk|   1 +
 tests/dpif-netdev.at |  14 +-
 tests/ofproto-dpif.at| 698 ++-
 tests/system-common-macros.at|   1 +
 tests/system-kmod-macros.at  |  35 ++
 tests/system-traffic.at  |  69 ++-
 tests/system-userspace-macros.at |  63 ++-
 tests/test-conntrack.c   | 232 +
 20 files changed, 3164 insertions(+), 109 deletions(-)
 create mode 100644 lib/conntrack-other.c
 create mode 100644 lib/conntrack-private.h
 create mode 100644 lib/conntrack-tcp.c
 create mode 100644 lib/conntrack.c
 create mode 100644 lib/conntrack.h
 create mode 100644 tests/test-conntrack.c

-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v7 16/16] netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

2016-04-15 Thread Daniele Di Proietto


On 14/04/2016 05:37, "Kavanagh, Mark B" <mark.b.kavan...@intel.com> wrote:

>Hi Daniele,
>
>One comment inline.
>
>Thanks,
>Mark
>
>>
>>This introduces in dpif-netdev and netdev-dpdk the first use for the
>>newly introduced reconfigure netdev call.
>>
>>When a request to change the number of queues comes, netdev-dpdk will
>>remember this and notify the upper layer via
>>netdev_request_reconfigure().
>>
>>The datapath, instead of periodically calling netdev_set_multiq(), can
>>detect this and call reconfigure().
>>
>>This mechanism can also be used to:
>>* Automatically match the number of rxq with the one provided by qemu
>>  via the new_device callback.
>>* Provide a way to change the MTU of dpdk devices at runtime.
>>* Move a DPDK vhost device to the proper NUMA socket.
>>
>>Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>>---
>> lib/dpif-netdev.c |  69 +-
>> lib/netdev-dpdk.c | 195
>>++
>> lib/netdev-provider.h |  23 +++---
>> lib/netdev.c  |  34 +++--
>> lib/netdev.h  |   3 +-
>> 5 files changed, 155 insertions(+), 169 deletions(-)

[...]

>>@@ -312,12 +305,12 @@ struct netdev_class {
>>  * making sure that these concurrent calls do not create a race
>>condition
>>  * by using multiple hw queues or locking.
>>  *
>>- * On error, the tx queue and rx queue configuration is
>>indeterminant.
>>- * Caller should make decision on whether to restore the previous or
>>- * the default configuration.  Also, caller must make sure there is
>>no
>>- * other thread accessing the queues at the same time. */
>>-int (*set_multiq)(struct netdev *netdev, unsigned int n_txq,
>>-  unsigned int n_rxq);
>>+ * The caller will call netdev_reconfigure() (if necessary) before
>>using
>>+ * netdev_send() on any of the newly configured queues, giving the
>>provider
>>+ * a chance to adjust its settings.
>>+ *
>>+ * On error, the tx queue configuration is unchanged. */
>>+int (*set_multiq)(struct netdev *netdev, unsigned int n_txq);
>
>Since this function now deals only with TX queues, an identifier along
>the lines of 'set_tx_multiq' might more accurately describe its
>functionality. Specific netdev classes would need to modify the names of
>their own specific 'set_multiq' functions accordingly.

You're right, 'set_tx_multiq()' is definitely a better name for it. I
updated it.

Thanks for all your feedback

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v7 11/16] ovs-thread: Do not quiesce in ovs_mutex_cond_wait().

2016-04-15 Thread Daniele Di Proietto
Hi Mark,

On 14/04/2016 05:36, "Kavanagh, Mark B" <mark.b.kavan...@intel.com> wrote:

>Hi Daniele,
>
>One comment inline.
>
>Cheers,
>Mark
>
>>
>>ovs_mutex_cond_wait() is used in many functions in dpif-netdev to
>>synchronize with pmd threads, but we can't guarantee that the callers do
>>not hold RCU references, so it's better to avoid quiescing.
>
>You'll need to update the following comment in lib/rcu.h accordingly:
>
>
> For example, poll_block() includes a quiescent state, as does
>ovs_mutex_cond_wait().
>

You're right, I removed the reference to ovs_mutex_cond_wait() there.

Thanks for noticing this!

>
>>
>>In system_stats_thread_func() the code relied on ovs_mutex_cond_wait()
>>to introduce a quiescent state, so explicit calls to
>>ovsrcu_quiesce_start() and ovsrcu_quiesce_end() are added there.
>>
>>Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>>---
>> lib/ovs-thread.c| 2 --
>> vswitchd/system-stats.c | 6 ++
>> 2 files changed, 6 insertions(+), 2 deletions(-)
>>
>>diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
>>index 3c065cf..26dd928 100644
>>--- a/lib/ovs-thread.c
>>+++ b/lib/ovs-thread.c
>>@@ -253,9 +253,7 @@ ovs_mutex_cond_wait(pthread_cond_t *cond, const
>>struct ovs_mutex *mutex_)
>> struct ovs_mutex *mutex = CONST_CAST(struct ovs_mutex *, mutex_);
>> int error;
>>
>>-ovsrcu_quiesce_start();
>> error = pthread_cond_wait(cond, &mutex->lock);
>>-ovsrcu_quiesce_end();
>>
>> if (OVS_UNLIKELY(error)) {
>> ovs_abort(error, "pthread_cond_wait failed");
>>diff --git a/vswitchd/system-stats.c b/vswitchd/system-stats.c
>>index df4971e..129f0cf 100644
>>--- a/vswitchd/system-stats.c
>>+++ b/vswitchd/system-stats.c
>>@@ -37,6 +37,7 @@
>> #include "json.h"
>> #include "latch.h"
>> #include "openvswitch/ofpbuf.h"
>>+#include "ovs-rcu.h"
>> #include "ovs-thread.h"
>> #include "poll-loop.h"
>> #include "shash.h"
>>@@ -615,7 +616,12 @@ system_stats_thread_func(void *arg OVS_UNUSED)
>>
>> ovs_mutex_lock(&mutex);
>> while (!enabled) {
>>+/* The thread is sleeping, potentially for a long time, and
>>it's
>>+ * not holding RCU protected references, so it makes sense
>>to
>>+ * quiesce */
>>+ovsrcu_quiesce_start();
>> ovs_mutex_cond_wait(&cond, &mutex);
>>+ovsrcu_quiesce_end();
>> }
>> ovs_mutex_unlock(&mutex);
>>
>>--
>>2.1.4
>

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v7 09/16] hmap: Use struct for hmap_at_position().

2016-04-15 Thread Daniele Di Proietto
Hi Mark,

On 14/04/2016 05:36, "Kavanagh, Mark B" <mark.b.kavan...@intel.com> wrote:

>Hi Daniele,
>
>One minor comment inline.
>
>Cheers,
>Mark
>
>>
>>The interface will be more similar to the cmap.
>>
>>Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>>---
>> lib/hmap.c | 26 --
>> lib/hmap.h |  7 ++-
>> lib/sset.c | 12 +---
>> lib/sset.h |  7 ++-
>> ofproto/ofproto-dpif.c |  8 +++-
>> 5 files changed, 32 insertions(+), 28 deletions(-)
>>
>>diff --git a/lib/hmap.c b/lib/hmap.c
>>index b70ce51..9462c5e 100644
>>--- a/lib/hmap.c
>>+++ b/lib/hmap.c
>>@@ -236,24 +236,22 @@ hmap_random_node(const struct hmap *hmap)
>> }
>>
>> /* Returns the next node in 'hmap' in hash order, or NULL if no nodes
>>remain in
>>- * 'hmap'.  Uses '*bucketp' and '*offsetp' to determine where to begin
>>- * iteration, and stores new values to pass on the next iteration into
>>them
>>- * before returning.
>>+ * 'hmap'.  Uses '*pos' to determine where to begin iteration, and
>>updates
>>+ * '*pos' to pass on the next iteration into them before returning.
>>  *
>>  * It's better to use plain HMAP_FOR_EACH and related functions, since
>>they are
>>  * faster and better at dealing with hmaps that change during iteration.
>>  *
>>- * Before beginning iteration, store 0 into '*bucketp' and '*offsetp'.
>>- */
>>+ * Before beginning iteration, set '*pos' to all zeros. */
>> struct hmap_node *
>> hmap_at_position(const struct hmap *hmap,
>>- uint32_t *bucketp, uint32_t *offsetp)
>>+ struct hmap_position *pos)
>> {
>> size_t offset;
>> size_t b_idx;
>>
>>-offset = *offsetp;
>>-for (b_idx = *bucketp; b_idx <= hmap->mask; b_idx++) {
>>+offset = pos->offset;
>>+for (b_idx = pos->bucket; b_idx <= hmap->mask; b_idx++) {
>> struct hmap_node *node;
>> size_t n_idx;
>>
>>@@ -261,11 +259,11 @@ hmap_at_position(const struct hmap *hmap,
>>  n_idx++, node = node->next) {
>> if (n_idx == offset) {
>> if (node->next) {
>>-*bucketp = node->hash & hmap->mask;
>>-*offsetp = offset + 1;
>>+pos->bucket = node->hash & hmap->mask;
>>+pos->offset = offset + 1;
>> } else {
>>-*bucketp = (node->hash & hmap->mask) + 1;
>>-*offsetp = 0;
>>+pos->bucket = (node->hash & hmap->mask) + 1;
>>+pos->offset = 0;
>> }
>> return node;
>> }
>>@@ -273,8 +271,8 @@ hmap_at_position(const struct hmap *hmap,
>> offset = 0;
>> }
>>
>>-*bucketp = 0;
>>-*offsetp = 0;
>>+pos->bucket = 0;
>>+pos->offset = 0;
>> return NULL;
>> }
>>
>>diff --git a/lib/hmap.h b/lib/hmap.h
>>index 08c4719..9a96c5f 100644
>>--- a/lib/hmap.h
>>+++ b/lib/hmap.h
>>@@ -201,8 +201,13 @@ static inline struct hmap_node *hmap_first(const
>>struct hmap *);
>> static inline struct hmap_node *hmap_next(const struct hmap *,
>>   const struct hmap_node *);
>>
>>+struct hmap_position {
>>+unsigned int bucket;
>>+unsigned int offset;
>>+};
>>+
>> struct hmap_node *hmap_at_position(const struct hmap *,
>>-   uint32_t *bucket, uint32_t *offset);
>>+   struct hmap_position *);
>>
>> /* Returns the number of nodes currently in 'hmap'. */
>> static inline size_t
>>diff --git a/lib/sset.c b/lib/sset.c
>>index f9d4fc0..4fd3fae 100644
>>--- a/lib/sset.c
>>+++ b/lib/sset.c
>>@@ -251,21 +251,19 @@ sset_equals(const struct sset *a, const struct
>>sset *b)
>> }
>>
>> /* Returns the next node in 'set' in hash order, or NULL if no nodes
>>remain in
>>- * 'set'.  Uses '*bucketp' and '*offsetp' to determine where to begin
>>- * iteration, and stores new values to pass on the next iteration into
>>them
>>- * before returning.
>>+ * 'set'.  Uses '*pos' to determine where to begin iteration, and
>>updates
>>+ * '*pos' to pass on the next iteration into them before returning.

Re: [ovs-dev] [PATCH v3] Update relevant artifacts to add support for DPDK 16.04.

2016-04-15 Thread Daniele Di Proietto
Thanks for the patch! I pushed this to master

2016-04-14 9:40 GMT-07:00 mweglicx :

> Following changes are applied:
>  - INSTALL.DPDK.md: CONFIG_RTE_BUILD_COMBINE_LIBS step has been
>removed because it is no longer present in DPDK configuration
>(combined library is created by default),
>  - INSTALL.DPDK.md: VHost Cuse configuration is updated,
>  - netdev-dpdk.c: Link speed definition is changed in DPDK and
>netdev_dpdk_get_features is updated accordingly,
>  - netdev-dpdk.c: TSO and checksum offload has been disabled for
>vhostuser device.
>  - .travis/linux-build.sh: DPDK version is updated and legacy
>flags have been removed in configuration.
>
> Signed-off-by: Michal Weglicki 
> Signed-off-by: Panu Matilainen 
>
> v1->v2
>  - link autonegotiation check is corrected.
> v2->v3
>  - TSO & checksum offload is disabled by default for VHOST user.
>  - .travis/linux-build.sh fPIC flag insertion is corrected.
> ---
>  .travis/linux-build.sh |  3 +--
>  INSTALL.DPDK.md| 21 -
>  lib/netdev-dpdk.c  | 28 
>  3 files changed, 25 insertions(+), 27 deletions(-)
>
> diff --git a/.travis/linux-build.sh b/.travis/linux-build.sh
> index ef712d0..065de39 100755
> --- a/.travis/linux-build.sh
> +++ b/.travis/linux-build.sh
> @@ -49,7 +49,6 @@ function install_dpdk()
>  cd dpdk-$1
>  fi
>  find ./ -type f | xargs sed -i
> 's/max-inline-insns-single=100/max-inline-insns-single=400/'
> -sed -ri 's,(CONFIG_RTE_BUILD_COMBINE_LIBS=).*,\1y,'
> config/common_linuxapp
>  echo 'CONFIG_RTE_BUILD_FPIC=y' >>config/common_linuxapp
>  sed -ri '/EXECENV_CFLAGS  = -pthread -fPIC/{s/$/\nelse ifeq
> ($(CONFIG_RTE_BUILD_FPIC),y)/;s/$/\nEXECENV_CFLAGS  = -pthread -fPIC/}'
> mk/exec-env/linuxapp/rte.vars.mk
>  make config CC=gcc T=x86_64-native-linuxapp-gcc
> @@ -69,7 +68,7 @@ fi
>
>  if [ "$DPDK" ]; then
>  if [ -z "$DPDK_VER" ]; then
> -DPDK_VER="2.2.0"
> +DPDK_VER="16.04"
>  fi
>  install_dpdk $DPDK_VER
>  if [ "$CC" = "clang" ]; then
> diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
> index 9ec8bf6..7f76df8 100644
> --- a/INSTALL.DPDK.md
> +++ b/INSTALL.DPDK.md
> @@ -16,7 +16,7 @@ OVS needs a system with 1GB hugepages support.
>  Building and Installing:
>  
>
> -Required: DPDK 2.2
> +Required: DPDK 16.04
>  Optional (if building with vhost-cuse): `fuse`, `fuse-devel`
> (`libfuse-dev`
>  on Debian/Ubuntu)
>
> @@ -24,16 +24,11 @@ on Debian/Ubuntu)
>1. Set `$DPDK_DIR`
>
>   ```
> - export DPDK_DIR=/usr/src/dpdk-2.2
> + export DPDK_DIR=/usr/src/dpdk-16.04
>   cd $DPDK_DIR
>   ```
>
> -  2. Update `config/common_linuxapp` so that DPDK generate single lib
> file.
> - (modification also required for IVSHMEM build)
> -
> - `CONFIG_RTE_BUILD_COMBINE_LIBS=y`
> -
> - Then run `make install` to build and install the library.
> +  2. Then run `make install` to build and install the library.
>   For default install without IVSHMEM:
>
>   `make install T=x86_64-native-linuxapp-gcc DESTDIR=install`
> @@ -496,7 +491,7 @@ the vswitchd.
>  DPDK vhost:
>  ---
>
> -DPDK 2.2 supports two types of vhost:
> +DPDK 16.04 supports two types of vhost:
>
>  1. vhost-user
>  2. vhost-cuse
> @@ -517,7 +512,7 @@ with OVS.
>  DPDK vhost-user Prerequisites:
>  -
>
> -1. DPDK 2.2 with vhost support enabled as documented in the "Building and
> +1. DPDK 16.04 with vhost support enabled as documented in the "Building
> and
> Installing section"
>
>  2. QEMU version v2.1.0+
> @@ -635,10 +630,10 @@ with OVS.
>  DPDK vhost-cuse Prerequisites:
>  -
>
> -1. DPDK 2.2 with vhost support enabled as documented in the "Building and
> +1. DPDK 16.04 with vhost support enabled as documented in the "Building
> and
> Installing section"
> As an additional step, you must enable vhost-cuse in DPDK by setting
> the
> -   following additional flag in `config/common_linuxapp`:
> +   following additional flag in `config/common_base`:
>
> `CONFIG_RTE_LIBRTE_VHOST_USER=n`
>
> @@ -938,7 +933,7 @@ Restrictions:
>  this with smaller page sizes.
>
>Platform and Network Interface:
> -  - By default with DPDK 2.2, a maximum of 64 TX queues can be used with
> an
> +  - By default with DPDK 16.04, a maximum of 64 TX queues can be used
> with an
>  Intel XL710 Network Interface on a platform with more than 64 logical
>  cores. If a user attempts to add an XL710 interface as a DPDK port
> type to
>  a system as described above, an error will be reported that
> initialization
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index e09b471..f8d010e 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -1740,35 +1740,35 @@ netdev_dpdk_get_features(const struct netdev
> *netdev_,
>  link = dev->link;
>  

Re: [ovs-dev] [PATCH v5] acinclude: Autodetect DPDK location when configuring OVS

2016-04-15 Thread Daniele Di Proietto


On 14/04/2016 14:51, "Ben Pfaff"  wrote:

>On Tue, Apr 12, 2016 at 11:44:15AM +0100, Bhanuprakash Bodireddy wrote:
>> When using DPDK datapath, the OVS configure script requires the DPDK
>> build directory passed on --with-dpdk. This can be avoided if DPDK
>> library, headers are in standard compiler search paths.
>> 
>> This patch fixes the problem by searching for DPDK libraries in standard
>> locations and configure OVS sources for dpdk datapath.
>> 
>> If the install location is manually specified in "--with-dpdk"
>> autodiscovery shall be skipped.
>> 
>> v4->v5
>> - Minor code fixes and indentation changes as suggested by Ben
>> 
>> Signed-off-by: Bhanuprakash Bodireddy 
>
>Daniele, do you want to review this and apply it if you're satisfied?
>
>Thanks,
>
>Ben.

Looks good to me, I added your name to AUTHORS and applied this to master.

Thanks!

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v6 06/12] dpif-netdev: Wait an RCU grace period before freeing ports.

2016-04-13 Thread Daniele Di Proietto


On 10/04/2016 12:23, "Ben Pfaff" <b...@ovn.org> wrote:

>On Fri, Apr 08, 2016 at 03:12:59AM +, Daniele Di Proietto wrote:
>> 
>> 
>> On 01/04/2016 09:52, "Jarno Rajahalme" <ja...@ovn.org> wrote:
>> 
>> >
>> >> On Mar 30, 2016, at 8:08 PM, Daniele Di Proietto
>> >><diproiet...@vmware.com> wrote:
>> >> 
>> >> 
>> >> On 30/03/2016 16:01, "Ben Pfaff" <b...@ovn.org> wrote:
>> >> 
>> >>> (I'm taking a look at this patch specifically because Daniele asked
>>me;
>> >>> I'm not planning to review the whole series.)
>> >>> 
>> >>> On Mon, Mar 28, 2016 at 12:41:40PM -0700, Daniele Di Proietto wrote:
>> >>>> The dpif-netdev datapath keeps ports in a cmap which is written
>>only
>> >>>>by
>> >>>> the main thread (holding port_mutex), but which is read
>>concurrently
>> >>>>by
>> >>>> many threads (most notably the pmd threads).
>> >>>> 
>> >>>> When removing ports from the datapath we should postpone the
>>deletion,
>> >>>> otherwise another thread might access invalid memory while reading
>>the
>> >>>> cmap.
>> >>>> 
>> >>>> This commit splits do_port_del() in do_port_remove() and
>> >>>> do_port_destroy(): the former removes the port from the cmap, while
>> >>>>the
>> >>>> latter reclaims the memory and drops the reference to the
>>underlying
>> >>>> netdev.
>> >>> 
>> >>> s/del_port/port_del/ here:
>> >> 
>> >> Thanks, changed
>> >> 
>> >>> 
>> >>>> dpif_netdev_del_port() now uses ovsrcu_synchronize() before calling
>> >>>> do_port_destroy(), to avoid memory corruption in concurrent
>>readers.
>> >>> 
>> >>> ovsrcu_synchronize() requires that nothing in the thread that calls
>>it
>> >>> is relying on RCU to keep objects around.  That means that no
>>caller of
>> >>> dpif_port_del()--there are a few of them--can rely on it.  This is
>> >>> usually a risky assumption, especially because this assumption can
>> >>> change later.  Is there reason to believe that it isn't important in
>> >>>all
>> >>> of these cases?
>> >> 
>> >> I agree that's risky, but I think it's the only way to keep the ports
>> >>RCU
>> >> protected, because a port needs to be effectively deleted before
>> >> dpif_netdev_port_del() can return.
>> >> 
>> >
>> >If this is because otherwise a following port_add can fail, as the old
>> >port is still around, maybe we could make the highest possible level of
>> >port_add detect the failure and then rcu_synchronize and try again?
>>Would
>> >that work?
>> >
>> >  Jarno
>> 
>> After some thought I decided to avoid using RCU for ports. I'll send an
>> updated
>> series soon.
>
>Well, that makes the discussion a little easier ;-)  Thanks.
>
>Do you want to me to review anything in the new version?

I was only concerned about the use of RCU. The new version should be
simpler.

Thanks!

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH] Update relevant artifacts to add support for DPDK 16.04.

2016-04-13 Thread Daniele Di Proietto
2016-04-13 9:21 GMT-07:00 Traynor, Kevin :

> > -Original Message-
> > From: dev [mailto:dev-boun...@openvswitch.org] On Behalf Of Panu
> Matilainen
> > Sent: Wednesday, April 13, 2016 8:50 AM
> > To: Weglicki, MichalX ; dev@openvswitch.org
> > Subject: Re: [ovs-dev] [PATCH] Update relevant artifacts to add support
> for
> > DPDK 16.04.
> >
>
> [snip]
>
> > As an aside, I've been thinking maybe this is a case where OVS could
> > support both DPDK 2.2 and 16.04. I know its unprecedented but maybe that
> > could change, restricting OVS to just one DPDK version seems
> > unnecessarily strict when talking about differences this trivial.
>
> Judging by the ML, it's more commonly requested to use the current release
> of DPDK with the last release of OVS e.g. OVS 2.5 and DPDK 16.04, than
> people
> wanting OVS master with DPDK X-1.
>
> Even for a trivial case like above - it would be ok now to add support for
> DPDK X-1
> but if we then add OVS code to take advantage of new DPDK X features (e.g.
> vhost
> pmd) we'll end up with messy code. Also testing efforts would increase
> (double?),
> so I don't think we would get it without a cost.
>

I agree, I'd still prefer to support a single version


>
> Kevin.
>
> >
> >   - Panu -
> > ___
> > dev mailing list
> > dev@openvswitch.org
> > http://openvswitch.org/mailman/listinfo/dev
> ___
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
>
___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v2] Update relevant artifacts to add support for DPDK 16.04.

2016-04-13 Thread Daniele Di Proietto
Thanks for the patch, I have a couple of comments:

DPDK 16.04 enables by default checksum offloads and TSO for vhostuser
device. While this seems to work ok, there seem to be a few problems with
this:
* OVS in userspace assumes that a packet is stored using a single mbuf
(it is not aware of the 'next' member or of the difference between
'data_len' and 'pkt_len')
* Most of the code in the userspace datapath is unaware of checksum
offloads.  We can easily lose the offload info and send a packet with the
wrong checksum to netdev-linux, for example.

Fixing this requires some extra work on the OVS side, which we should do
eventually, but for the moment I think we should disable offloads with
something like this:

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index e6aac8f..8ecd85b 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2240,6 +2240,9 @@ static int
 dpdk_vhost_class_init(void)
 {
 rte_vhost_driver_callback_register(&virtio_net_device_ops);
+rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
+  | 1ULL << VIRTIO_NET_F_HOST_TSO6
+  | 1ULL << VIRTIO_NET_F_CSUM);
 ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
 return 0;
 }

One more comment inline, otherwise I'm happy with the patch

2016-04-13 2:41 GMT-07:00 mweglicx :

> Following changes are applied:
>  - INSTALL.DPDK.md: CONFIG_RTE_BUILD_COMBINE_LIBS step has been
>removed because it is no longer present in DPDK configuration
>(combined library is created by default),
>  - INSTALL.DPDK.md: VHost Cuse configuration is updated,
>  - netdev-dpdk.c: Link speed definition is changed in DPDK and
>netdev_dpdk_get_features is updated accordingly,
>  - .travis/linux-build.sh: DPDK version is updated and legacy
>flags have been removed in configuration.
>
> Signed-off-by: Michal Weglicki 
> Signed-off-by: Panu Matilainen 
>
> v1->v2
>  - link autonegotiation check is corrected.
> ---
>  .travis/linux-build.sh |  5 +
>  INSTALL.DPDK.md| 23 +--
>  lib/netdev-dpdk.c  | 24 
>  3 files changed, 22 insertions(+), 30 deletions(-)
>
> diff --git a/.travis/linux-build.sh b/.travis/linux-build.sh
> index ef712d0..a3c8e6e 100755
> --- a/.travis/linux-build.sh
> +++ b/.travis/linux-build.sh
> @@ -49,9 +49,6 @@ function install_dpdk()
>  cd dpdk-$1
>  fi
>  find ./ -type f | xargs sed -i
> 's/max-inline-insns-single=100/max-inline-insns-single=400/'
> -sed -ri 's,(CONFIG_RTE_BUILD_COMBINE_LIBS=).*,\1y,'
> config/common_linuxapp
> -echo 'CONFIG_RTE_BUILD_FPIC=y' >>config/common_linuxapp
> -sed -ri '/EXECENV_CFLAGS  = -pthread -fPIC/{s/$/\nelse ifeq
> ($(CONFIG_RTE_BUILD_FPIC),y)/;s/$/\nEXECENV_CFLAGS  = -pthread -fPIC/}'
> mk/exec-env/linuxapp/rte.vars.mk
>

I'm really glad that we can get rid of the COMBINE_LIBS configuration
option.
I think the other two lines are still needed, because (for testing
purposes) we want to link DPDK into libopenvswitch.so.


>  make config CC=gcc T=x86_64-native-linuxapp-gcc
>  make CC=gcc RTE_KERNELDIR=$KERNELSRC
>  echo "Installed DPDK source in $(pwd)"
> @@ -69,7 +66,7 @@ fi
>
>  if [ "$DPDK" ]; then
>  if [ -z "$DPDK_VER" ]; then
> -DPDK_VER="2.2.0"
> +DPDK_VER="16.04"
>  fi
>  install_dpdk $DPDK_VER
>  if [ "$CC" = "clang" ]; then
> diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
> index 9ec8bf6..8c8cd4c 100644
> --- a/INSTALL.DPDK.md
> +++ b/INSTALL.DPDK.md
> @@ -16,7 +16,7 @@ OVS needs a system with 1GB hugepages support.
>  Building and Installing:
>  
>
> -Required: DPDK 2.2
> +Required: DPDK 16.04
>  Optional (if building with vhost-cuse): `fuse`, `fuse-devel`
> (`libfuse-dev`
>  on Debian/Ubuntu)
>
> @@ -24,16 +24,11 @@ on Debian/Ubuntu)
>1. Set `$DPDK_DIR`
>
>   ```
> - export DPDK_DIR=/usr/src/dpdk-2.2
> + export DPDK_DIR=/usr/src/dpdk-16.04
>   cd $DPDK_DIR
>   ```
>
> -  2. Update `config/common_linuxapp` so that DPDK generate single lib
> file.
> - (modification also required for IVSHMEM build)
> -
> - `CONFIG_RTE_BUILD_COMBINE_LIBS=y`
> -
> - Then run `make install` to build and install the library.
> +  2. Then run `make install` to build and install the library.
>   For default install without IVSHMEM:
>
>   `make install T=x86_64-native-linuxapp-gcc DESTDIR=install`
> @@ -81,7 +76,7 @@ Using the DPDK with ovs-vswitchd:
>
>  1. Setup system boot
> Add the following options to the kernel bootline:
> -
> +
> `default_hugepagesz=1GB hugepagesz=1G hugepages=1`
>
>  2. Setup DPDK devices:
> @@ -496,7 +491,7 @@ the vswitchd.
>  DPDK vhost:
>  ---
>
> -DPDK 2.2 supports two types of vhost:
> +DPDK 16.04 supports two types of vhost:
>
>  1. vhost-user
>  2. vhost-cuse
> @@ -517,7 +512,7 @@ with OVS.
>  DPDK vhost-user 

Re: [ovs-dev] [PATCH RFC 1/1] netdev-dpdk.c: Add ingress-policing functionality.

2016-04-07 Thread Daniele Di Proietto
Hi Ian,

On 07/04/2016 06:00, "Stokes, Ian"  wrote:

>> > >71034a0..faf3583 100644
>> > >--- a/lib/netdev-dpdk.c
>> > >+++ b/lib/netdev-dpdk.c
>> > >@@ -53,6 +53,7 @@
>> > >
>> > > #include "rte_config.h"
>> > > #include "rte_mbuf.h"
>> > >+#include "rte_meter.h"
>> > > #include "rte_virtio_net.h"
>> > >
>> > > VLOG_DEFINE_THIS_MODULE(dpdk);
>> > >@@ -193,6 +194,11 @@ struct dpdk_ring {
>> > > struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);  };
>> > >
>> > >+struct ingress_policer {
>> > >+struct rte_meter_srtcm_params app_srtcm_params;
>> > >+struct rte_meter_srtcm in_policer; };
>> > >+
>> > > struct netdev_dpdk {
>> > > struct netdev up;
>> > > int port_id;
>> > >@@ -231,6 +237,13 @@ struct netdev_dpdk {
>> > > /* Identifier used to distinguish vhost devices from each other
>> */
>> > > char vhost_id[PATH_MAX];
>> > >
>> > >+/* Ingress Policer */
>> > >+rte_spinlock_t policer_lock;
>> > >+struct ingress_policer *ingress_policer;
>> > >+
>> >
>> > I would prefer not to have a lock at this level.
>> >
>> > I think it would make more sense to make the ingress_policer pointer
>> > RCU protected and embed the spinlock into struct ingress_policer, to
>> > protect the token bucket.
>> >
>> Sure I agree, I was modelling this on what we have currently in QoS just
>> for a rough implementation.
>> I can take a look at using RCU instead. This could possibly be extended
>> to the QoS use case also in the future.
>
>Hi Daniele, I have been looking at an RCU implementation here but so far
>have not been able to get it working correctly.
>
>The issue I'm seeing is when I destroy the policer while traffic is
>passing a segfault sometimes occurs as the meter is in use when the
>ingress policer is set to NULL.
>
>I'm pretty sure this is down to my understanding (or lack thereof) of the
>ovsrcu behavior.
>
>Is the following high level implementation correct in your eyes?
>
>The ingress policer struct is as follows:
>
>struct ingress_policer {
>struct rte_meter_srtcm_params app_srtcm_params;
>struct rte_meter_srtcm in_policer;
>};
>
>From your comment above you mention embedding the spinlock in the ingress
>policer struct.
>Just to clarify, does the rcu by nature embed a spinlock or did you mean
>move the rte_spinlock policer_lock from the netdev_dpdk struct into the
>ingress policer struct?
>
>Is the behavior you are thinking of something like the following for when
>traffic is being processed?
>
>1. Get the rcu ingress_policer pointer.
>2. Lock the spinlock in the ingress policer struct.
>3. Set the ovsrcu pointer
>4. Call ovsrcu_synchronize so that all threads see that the policer is
>locked (Stop threads from accessing the ingress policer)
>5. Process the packets in the meter as usual.
>6. Unlock the spinlock.
>7. Set the ovsrcu pointer
>6. Synchronize again? (So that threads can access the ingress policer
>again)
>
>For destroying the ingress policer
>
>1. Get the rcu ingress_policer pointer.
>2. Lock the spinlock in the ingress policer struct.
>3. Set the ovsrcu pointer
>4. Call ovsrcu_synchronize so that all threads see that the policer is
>locked (Stop threads from accessing the meter)
>5. Destroy the ingress policer.
>6. Unlock the spinlock - if the spinlock is embedded in the ingress
>policer struct we have a problem here as it cannot  be free now, the
>struct has been destroyed.
>7. Set the ovsrcu pointer
>8. Synchronize again? (So that threads can see the ingress policer point
>for the netdev is now null)
>
>Thanks
>Ian
>

What I had in mind was simpler:

processing traffic:

p = ovsrcu_get(&dev->ingress_policer)
if (p) {
rte_spinlock_lock(&p->lock);
policer_pkt_handle(p, pkts...);
rte_spinlock_unlock(&p->lock);
}

destroying:

ovs_mutex_lock(&dev->mutex);
...
p = ovsrcu_get_protected(&dev->ingress_policer);
ovsrcu_postpone(destroy_policer, p);
ovsrcu_set(&dev->ingress_policer, NULL);
...
ovs_mutex_unlock(&dev->mutex);


static void destroy_policer (struct ingress_policer *p)
{
/*...*/
free(p);
}

My goal was to avoid taking the spinlock unless QoS is configured (p !=
NULL).


The pointer returned by ovsrcu_get() is guaranteed to be valid until the
next grace period, because ovsrcu_postpone() will not call free() until
the next grace period.

Or, from another point of view, after a grace period, when
ovsrcu_postpone() will actually call free(), all the other threads must
see the new value (NULL), so it is safe to reclaim memory.

The spinlock is used just to protect the meter.

Does this make sense?

Hope this helps,

Daniele

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v7 16/16] netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

2016-04-07 Thread Daniele Di Proietto
This introduces in dpif-netdev and netdev-dpdk the first use for the
newly introduced reconfigure netdev call.

When a request to change the number of queues comes, netdev-dpdk will
remember this and notify the upper layer via
netdev_request_reconfigure().

The datapath, instead of periodically calling netdev_set_multiq(), can
detect this and call reconfigure().

This mechanism can also be used to:
* Automatically match the number of rxq with the one provided by qemu
  via the new_device callback.
* Provide a way to change the MTU of dpdk devices at runtime.
* Move a DPDK vhost device to the proper NUMA socket.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c |  69 +-
 lib/netdev-dpdk.c | 195 ++
 lib/netdev-provider.h |  23 +++---
 lib/netdev.c  |  34 +++--
 lib/netdev.h  |   3 +-
 5 files changed, 155 insertions(+), 169 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index fc81741..d9bfe80 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -257,8 +257,6 @@ struct dp_netdev_port {
 unsigned n_rxq; /* Number of elements in 'rxq' */
 struct netdev_rxq **rxq;
 char *type; /* Port type as requested by user. */
-int latest_requested_n_rxq; /* Latest requested from netdev number
-   of rx queues. */
 };
 
 /* Contained by struct dp_netdev_flow's 'stats' member.  */
@@ -1159,20 +1157,26 @@ port_create(const char *devname, const char *open_type, 
const char *type,
 /* There can only be ovs_numa_get_n_cores() pmd threads,
  * so creates a txq for each, and one extra for the non
  * pmd threads. */
-error = netdev_set_multiq(netdev, n_cores + 1,
-  netdev_requested_n_rxq(netdev));
+error = netdev_set_multiq(netdev, n_cores + 1);
 if (error && (error != EOPNOTSUPP)) {
 VLOG_ERR("%s, cannot set multiq", devname);
 goto out;
 }
 }
+
+if (netdev_is_reconf_required(netdev)) {
+error = netdev_reconfigure(netdev);
+if (error) {
+goto out;
+}
+}
+
 port = xzalloc(sizeof *port);
 port->port_no = port_no;
 port->netdev = netdev;
 port->n_rxq = netdev_n_rxq(netdev);
 port->rxq = xcalloc(port->n_rxq, sizeof *port->rxq);
 port->type = xstrdup(type);
-port->latest_requested_n_rxq = netdev_requested_n_rxq(netdev);
 
 for (i = 0; i < port->n_rxq; i++) {
 error = netdev_rxq_open(netdev, >rxq[i], i);
@@ -2453,27 +2457,6 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op 
**ops, size_t n_ops)
 }
 }
 
-/* Returns true if the configuration for rx queues is changed. */
-static bool
-pmd_n_rxq_changed(const struct dp_netdev *dp)
-{
-struct dp_netdev_port *port;
-
-ovs_mutex_lock(>port_mutex);
-HMAP_FOR_EACH (port, node, >ports) {
-int requested_n_rxq = netdev_requested_n_rxq(port->netdev);
-
-if (netdev_is_pmd(port->netdev)
-&& port->latest_requested_n_rxq != requested_n_rxq) {
-ovs_mutex_unlock(>port_mutex);
-return true;
-}
-}
-ovs_mutex_unlock(>port_mutex);
-
-return false;
-}
-
 static bool
 cmask_equals(const char *a, const char *b)
 {
@@ -2597,11 +2580,9 @@ static int
 port_reconfigure(struct dp_netdev_port *port)
 {
 struct netdev *netdev = port->netdev;
-int requested_n_rxq = netdev_requested_n_rxq(netdev);
 int i, err;
 
-if (!netdev_is_pmd(port->netdev)
-|| port->latest_requested_n_rxq != requested_n_rxq) {
+if (!netdev_is_reconf_required(netdev)) {
 return 0;
 }
 
@@ -2612,15 +2593,14 @@ port_reconfigure(struct dp_netdev_port *port)
 }
 port->n_rxq = 0;
 
-/* Sets the new rx queue config. */
-err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores() + 1,
-requested_n_rxq);
+/* Allows 'netdev' to apply the pending configuration changes. */
+err = netdev_reconfigure(netdev);
 if (err && (err != EOPNOTSUPP)) {
-VLOG_ERR("Failed to set dpdk interface %s rx_queue to: %u",
- netdev_get_name(port->netdev), requested_n_rxq);
+VLOG_ERR("Failed to set interface %s new configuration",
+ netdev_get_name(netdev));
 return err;
 }
-/* If the set_multiq() above succeeds, reopens the 'rxq's. */
+/* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
 port->rxq = xrealloc(port->rxq, sizeof *port->rxq * netdev_n_rxq(netdev));
 for (i = 0; i < netdev_n_rxq(netdev); i++) {
 err = netdev_rxq_open(netdev, >rxq[i], i);
@@ -2664,6 +2644,22 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
 dp_netdev

[ovs-dev] [PATCH v7 13/16] dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

2016-04-07 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c   | 144 ++--
 lib/dpif-provider.h |   3 +-
 2 files changed, 84 insertions(+), 63 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index fba6592..bf04867 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -224,7 +224,9 @@ struct dp_netdev {
 ovsthread_key_t per_pmd_key;
 
 /* Cpu mask for pin of pmd threads. */
+char *requested_pmd_cmask;
 char *pmd_cmask;
+
 uint64_t last_tnl_conf_seq;
 };
 
@@ -2451,18 +2453,17 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op 
**ops, size_t n_ops)
 }
 }
 
-/* Returns true if the configuration for rx queues or cpu mask
- * is changed. */
+/* Returns true if the configuration for rx queues is changed. */
 static bool
-pmd_config_changed(const struct dp_netdev *dp, const char *cmask)
+pmd_n_rxq_changed(const struct dp_netdev *dp)
 {
 struct dp_netdev_port *port;
 
 ovs_mutex_lock(>port_mutex);
 HMAP_FOR_EACH (port, node, >ports) {
-struct netdev *netdev = port->netdev;
-int requested_n_rxq = netdev_requested_n_rxq(netdev);
-if (netdev_is_pmd(netdev)
+int requested_n_rxq = netdev_requested_n_rxq(port->netdev);
+
+if (netdev_is_pmd(port->netdev)
 && port->latest_requested_n_rxq != requested_n_rxq) {
 ovs_mutex_unlock(>port_mutex);
 return true;
@@ -2470,69 +2471,29 @@ pmd_config_changed(const struct dp_netdev *dp, const 
char *cmask)
 }
 ovs_mutex_unlock(>port_mutex);
 
-if (dp->pmd_cmask != NULL && cmask != NULL) {
-return strcmp(dp->pmd_cmask, cmask);
-} else {
-return (dp->pmd_cmask != NULL || cmask != NULL);
+return false;
+}
+
+static bool
+cmask_equals(const char *a, const char *b)
+{
+if (a && b) {
+return !strcmp(a, b);
 }
+
+return a == NULL && b == NULL;
 }
 
-/* Resets pmd threads if the configuration for 'rxq's or cpu mask changes. */
+/* Changes the number or the affinity of pmd threads.  The changes are actually
+ * applied in dpif_netdev_run(). */
 static int
 dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
 {
 struct dp_netdev *dp = get_dp_netdev(dpif);
 
-if (pmd_config_changed(dp, cmask)) {
-struct dp_netdev_port *port;
-
-dp_netdev_destroy_all_pmds(dp);
-
-ovs_mutex_lock(>port_mutex);
-HMAP_FOR_EACH (port, node, >ports) {
-struct netdev *netdev = port->netdev;
-int requested_n_rxq = netdev_requested_n_rxq(netdev);
-if (netdev_is_pmd(port->netdev)
-&& port->latest_requested_n_rxq != requested_n_rxq) {
-int i, err;
-
-/* Closes the existing 'rxq's. */
-for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
-netdev_rxq_close(port->rxq[i]);
-port->rxq[i] = NULL;
-}
-port->n_rxq = 0;
-
-/* Sets the new rx queue config.  */
-err = netdev_set_multiq(port->netdev,
-ovs_numa_get_n_cores() + 1,
-requested_n_rxq);
-if (err && (err != EOPNOTSUPP)) {
-VLOG_ERR("Failed to set dpdk interface %s rx_queue to:"
- " %u", netdev_get_name(port->netdev),
- requested_n_rxq);
-ovs_mutex_unlock(>port_mutex);
-return err;
-}
-port->latest_requested_n_rxq = requested_n_rxq;
-/* If the set_multiq() above succeeds, reopens the 'rxq's. */
-port->n_rxq = netdev_n_rxq(port->netdev);
-port->rxq = xrealloc(port->rxq, sizeof *port->rxq * 
port->n_rxq);
-for (i = 0; i < port->n_rxq; i++) {
-netdev_rxq_open(port->netdev, >rxq[i], i);
-}
-}
-}
-/* Reconfigures the cpu mask. */
-ovs_numa_set_cpu_mask(cmask);
-free(dp->pmd_cmask);
-dp->pmd_cmask = cmask ? xstrdup(cmask) : NULL;
-
-/* Restores the non-pmd. */
-dp_netdev_set_nonpmd(dp);
-/* Restores all pmd threads. */
-dp_netdev_reset_pmd_threads(dp);
-ovs_mutex_unlock(>port_mutex);
+if (!cmask_equals(dp->requested_pmd_cmask, cmask)) {
+free(dp->requested_pmd_cmask);
+dp->requested_pmd_cmask = cmask ? xstrdup(cmask) : NULL;
 }
 
 return 0;
@@ -2632,6 +2593,59 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 }
 }
 
+static void
+reconfigure_pmd_threads(struct dp_netdev *dp)
+OVS_REQUIRES(dp->port_mu

[ovs-dev] [PATCH v7 07/16] hmap: Add HMAP_FOR_EACH_POP.

2016-04-07 Thread Daniele Di Proietto
Makes popping each member of the hmap a bit easier.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/cfm.c|  5 ++---
 lib/hmap.h   |  4 
 lib/id-pool.c|  5 ++---
 lib/learning-switch.c|  5 ++---
 lib/netdev-linux.c   |  5 ++---
 lib/odp-util.c   |  7 +++
 ofproto/bond.c   | 10 --
 ofproto/in-band.c|  5 ++---
 ofproto/ofproto-dpif-ipfix.c |  5 ++---
 ofproto/ofproto-dpif-xlate.c |  5 ++---
 ofproto/ofproto.c|  5 ++---
 ofproto/pinsched.c   |  5 ++---
 ovn/controller-vtep/vtep.c   |  5 ++---
 ovn/controller/encaps.c  |  5 ++---
 ovn/controller/lport.c   |  5 ++---
 ovn/controller/ofctrl.c  |  5 ++---
 ovn/controller/physical.c|  4 +---
 ovn/controller/pinctrl.c |  5 ++---
 ovn/lib/expr.c   |  5 ++---
 ovn/northd/ovn-northd.c  | 10 --
 ovsdb/monitor.c  |  5 ++---
 ovsdb/row.c  |  5 ++---
 tests/library.at |  2 +-
 tests/test-hmap.c| 42 ++
 24 files changed, 93 insertions(+), 71 deletions(-)

diff --git a/lib/cfm.c b/lib/cfm.c
index cf1f725..fb077de 100644
--- a/lib/cfm.c
+++ b/lib/cfm.c
@@ -374,7 +374,7 @@ cfm_create(const struct netdev *netdev) OVS_EXCLUDED(mutex)
 void
 cfm_unref(struct cfm *cfm) OVS_EXCLUDED(mutex)
 {
-struct remote_mp *rmp, *rmp_next;
+struct remote_mp *rmp;
 
 if (!cfm) {
 return;
@@ -389,8 +389,7 @@ cfm_unref(struct cfm *cfm) OVS_EXCLUDED(mutex)
 hmap_remove(all_cfms, >hmap_node);
 ovs_mutex_unlock();
 
-HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, >remote_mps) {
-hmap_remove(>remote_mps, >node);
+HMAP_FOR_EACH_POP (rmp, node, >remote_mps) {
 free(rmp);
 }
 
diff --git a/lib/hmap.h b/lib/hmap.h
index 53e75cc..08c4719 100644
--- a/lib/hmap.h
+++ b/lib/hmap.h
@@ -192,6 +192,10 @@ bool hmap_contains(const struct hmap *, const struct 
hmap_node *);
  __VA_ARGS__;   \
  (NODE != OBJECT_CONTAINING(NULL, NODE, MEMBER)) || (NODE = NULL); \
  ASSIGN_CONTAINER(NODE, hmap_next(HMAP, &(NODE)->MEMBER), MEMBER))
+#define HMAP_FOR_EACH_POP(NODE, MEMBER, HMAP)  \
+while (!hmap_is_empty(HMAP)\
+   && (INIT_CONTAINER(NODE, hmap_first(HMAP), MEMBER), 1)  \
+   && (hmap_remove(HMAP, &(NODE)->MEMBER), 1))
 
 static inline struct hmap_node *hmap_first(const struct hmap *);
 static inline struct hmap_node *hmap_next(const struct hmap *,
diff --git a/lib/id-pool.c b/lib/id-pool.c
index 6b93d37..f32c008 100644
--- a/lib/id-pool.c
+++ b/lib/id-pool.c
@@ -69,10 +69,9 @@ id_pool_init(struct id_pool *pool, uint32_t base, uint32_t 
n_ids)
 static void
 id_pool_uninit(struct id_pool *pool)
 {
-struct id_node *id_node, *next;
+struct id_node *id_node;
 
-HMAP_FOR_EACH_SAFE(id_node, next, node, >map) {
-hmap_remove(>map, _node->node);
+HMAP_FOR_EACH_POP(id_node, node, >map) {
 free(id_node);
 }
 
diff --git a/lib/learning-switch.c b/lib/learning-switch.c
index 7c445b2..870192d 100644
--- a/lib/learning-switch.c
+++ b/lib/learning-switch.c
@@ -269,11 +269,10 @@ void
 lswitch_destroy(struct lswitch *sw)
 {
 if (sw) {
-struct lswitch_port *node, *next;
+struct lswitch_port *node;
 
 rconn_destroy(sw->rconn);
-HMAP_FOR_EACH_SAFE (node, next, hmap_node, >queue_numbers) {
-hmap_remove(>queue_numbers, >hmap_node);
+HMAP_FOR_EACH_POP (node, hmap_node, >queue_numbers) {
 free(node);
 }
 shash_destroy(>queue_names);
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index a7d7ac7..2c1ffec 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -3851,10 +3851,9 @@ static void
 htb_tc_destroy(struct tc *tc)
 {
 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
-struct htb_class *hc, *next;
+struct htb_class *hc;
 
-HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, >tc.queues) {
-hmap_remove(>tc.queues, >tc_queue.hmap_node);
+HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, >tc.queues) {
 free(hc);
 }
 tc_destroy(tc);
diff --git a/lib/odp-util.c b/lib/odp-util.c
index b4689cc..3c75379 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -2081,10 +2081,9 @@ odp_portno_names_get(const struct hmap *portno_names, 
odp_port_t port_no)
 void
 odp_portno_names_destroy(struct hmap *portno_names)
 {
-struct odp_portno_names *odp_portno_names, *odp_portno_names_next;
-HMAP_FOR_EACH_SAFE (odp_portno_names, odp_portno_names_next,
-hmap_node, portno_names) {
-hmap_remove(portno_names, _portno_names->hmap_node);
+struct odp_portno_names *odp_po

[ovs-dev] [PATCH v7 03/16] dpif-netdev: Factor out port_create() from do_add_port().

2016-04-07 Thread Daniele Di Proietto
Instead of performing every operation inside do_add_port() it seems
clearer to introduce port_create(), since we already have
port_destroy().

No functional change.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 69 ++-
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 78e4e35..27277e8 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1096,29 +1096,22 @@ hash_port_no(odp_port_t port_no)
 }
 
 static int
-do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
-odp_port_t port_no)
-OVS_REQUIRES(dp->port_mutex)
+port_create(const char *devname, const char *open_type, const char *type,
+odp_port_t port_no, struct dp_netdev_port **portp)
 {
 struct netdev_saved_flags *sf;
 struct dp_netdev_port *port;
-struct netdev *netdev;
 enum netdev_flags flags;
-const char *open_type;
-int error = 0;
-int i, n_open_rxqs = 0;
+struct netdev *netdev;
+int n_open_rxqs = 0;
+int i, error;
 
-/* Reject devices already in 'dp'. */
-if (!get_port_by_name(dp, devname, )) {
-error = EEXIST;
-goto out;
-}
+*portp = NULL;
 
 /* Open and validate network device. */
-open_type = dpif_netdev_port_open_type(dp->class, type);
 error = netdev_open(devname, open_type, );
 if (error) {
-goto out;
+return error;
 }
 /* XXX reject non-Ethernet devices */
 
@@ -1126,7 +1119,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 if (flags & NETDEV_LOOPBACK) {
 VLOG_ERR("%s: cannot add a loopback device", devname);
 error = EINVAL;
-goto out_close;
+goto out;
 }
 
 if (netdev_is_pmd(netdev)) {
@@ -1135,7 +1128,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 if (n_cores == OVS_CORE_UNSPEC) {
 VLOG_ERR("%s, cannot get cpu core info", devname);
 error = ENOENT;
-goto out_close;
+goto out;
 }
 /* There can only be ovs_numa_get_n_cores() pmd threads,
  * so creates a txq for each, and one extra for the non
@@ -1144,14 +1137,14 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
   netdev_requested_n_rxq(netdev));
 if (error && (error != EOPNOTSUPP)) {
 VLOG_ERR("%s, cannot set multiq", devname);
-goto out_close;
+goto out;
 }
 }
 port = xzalloc(sizeof *port);
 port->port_no = port_no;
 port->netdev = netdev;
 port->n_rxq = netdev_n_rxq(netdev);
-port->rxq = xmalloc(sizeof *port->rxq * port->n_rxq);
+port->rxq = xcalloc(port->n_rxq, sizeof *port->rxq);
 port->type = xstrdup(type);
 port->latest_requested_n_rxq = netdev_requested_n_rxq(netdev);
 
@@ -1171,12 +1164,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 }
 port->sf = sf;
 
-cmap_insert(>ports, >node, hash_port_no(port_no));
-
-if (netdev_is_pmd(netdev)) {
-dp_netdev_add_port_to_pmds(dp, port);
-}
-seq_change(dp->port_seq);
+*portp = port;
 
 return 0;
 
@@ -1187,13 +1175,42 @@ out_rxq_close:
 free(port->type);
 free(port->rxq);
 free(port);
-out_close:
-netdev_close(netdev);
+
 out:
+netdev_close(netdev);
 return error;
 }
 
 static int
+do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
+odp_port_t port_no)
+OVS_REQUIRES(dp->port_mutex)
+{
+struct dp_netdev_port *port;
+int error;
+
+/* Reject devices already in 'dp'. */
+if (!get_port_by_name(dp, devname, )) {
+return EEXIST;
+}
+
+error = port_create(devname, dpif_netdev_port_open_type(dp->class, type),
+type, port_no, );
+if (error) {
+return error;
+}
+
+cmap_insert(>ports, >node, hash_port_no(port_no));
+
+if (netdev_is_pmd(port->netdev)) {
+dp_netdev_add_port_to_pmds(dp, port);
+}
+seq_change(dp->port_seq);
+
+return 0;
+}
+
+static int
 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
  odp_port_t *port_nop)
 {
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v7 15/16] netdev: Add reconfigure request mechanism.

2016-04-07 Thread Daniele Di Proietto
A netdev provider, especially a PMD provider (like netdev DPDK) might
not be able to change some of its parameters (such as MTU, or number of
queues) without stopping everything and restarting.

This commit introduces a mechanism that allows a netdev provider to
request a restart (netdev_request_reconfigure()).  The upper layer can
be notified via netdev_wait_reconf_required() and
netdev_is_reconf_required().  After closing all the rxqs the upper layer
can finally call netdev_reconfigure(), to make sure that the new
configuration is in place.

This will be used by next commit to reconfigure rx and tx queues in
netdev-dpdk.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
Tested-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavan...@intel.com>
---
 lib/netdev-bsd.c  |  1 +
 lib/netdev-dpdk.c |  1 +
 lib/netdev-dummy.c|  1 +
 lib/netdev-linux.c|  1 +
 lib/netdev-provider.h | 27 ++-
 lib/netdev-vport.c|  1 +
 lib/netdev.c  | 38 ++
 lib/netdev.h  |  4 
 8 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index 49c05f4..32e8f74 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1536,6 +1536,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_bsd_arp_lookup, /* arp_lookup */  \
  \
 netdev_bsd_update_flags, \
+NULL, /* reconfigure */  \
  \
 netdev_bsd_rxq_alloc,\
 netdev_bsd_rxq_construct,\
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index c7217ea..635cc74 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2716,6 +2716,7 @@ static const struct dpdk_qos_ops egress_policer_ops = {
 NULL,   /* arp_lookup */  \
   \
 netdev_dpdk_update_flags, \
+NULL,   /* reconfigure */ \
   \
 netdev_dpdk_rxq_alloc,\
 netdev_dpdk_rxq_construct,\
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index a1013ff..09d753f 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1275,6 +1275,7 @@ static const struct netdev_class dummy_class = {
 NULL,   /* arp_lookup */
 
 netdev_dummy_update_flags,
+NULL,   /* reconfigure */
 
 netdev_dummy_rxq_alloc,
 netdev_dummy_rxq_construct,
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 2c1ffec..1af08f3 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2806,6 +2806,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_linux_arp_lookup,\
 \
 netdev_linux_update_flags,  \
+NULL,   /* reconfigure */   \
 \
 netdev_linux_rxq_alloc, \
 netdev_linux_rxq_construct, \
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index cda25eb..853fc44 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -52,6 +52,16 @@ struct netdev {
  * 'netdev''s flags, features, ethernet address, or carrier changes. */
 uint64_t change_seq;
 
+/* A netdev provider might be unable to change some of the device's
+ * parameters (n_rxq, mtu) when the device is in use.  In this case
+ * the provider can notify the upper layer by calling
+ * netdev_request_reconfigure().  The upper layer will react by stopping
+ * the operations on the device and calling netdev_reconfigure() to allow
+ * the configuration changes.  'last_reconfigure_seq' remembers the value
+ * of 'reconfigure_seq' when the last reconfiguration happened. */
+struct seq *reconfigure_seq;
+uint64_t last_reconfigure_seq;
+
 /* The core netdev code initializes these at netdev construction and only
  * provide read-only access to its client.  Netdev implementations may
  * modify them. */
@@ -64,7 +74,7 @@ struct netdev {
 struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". 
*/
 };
 
-static void
+static inline void
 netdev_change_seq_changed(const struct netdev *netdev_)
 {
 struct netdev *netdev = CONST_CAST(struct netdev *, netdev_);
@@ -75,6 +85,12 @@ netdev_change_seq_ch

[ovs-dev] [PATCH v7 12/16] ofproto-dpif: Call dpif_poll_threads_set() before dpif_run()

2016-04-07 Thread Daniele Di Proietto
An upcoming commit will make dpif_poll_threads_set() record the
requested configuration and dpif_run() apply it, so it makes sense to
change the order.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
Tested-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Mark Kavanagh <mark.b.kavan...@intel.com>
---
 ofproto/ofproto-dpif.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index aceb11f..7eff63f 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -536,6 +536,8 @@ type_run(const char *type)
 return 0;
 }
 
+/* This must be called before dpif_run() */
+dpif_poll_threads_set(backer->dpif, pmd_cpu_mask);
 
 if (dpif_run(backer->dpif)) {
 backer->need_revalidate = REV_RECONFIGURE;
@@ -564,8 +566,6 @@ type_run(const char *type)
 udpif_set_threads(backer->udpif, n_handlers, n_revalidators);
 }
 
-dpif_poll_threads_set(backer->dpif, pmd_cpu_mask);
-
 if (backer->need_revalidate) {
 struct ofproto_dpif *ofproto;
 struct simap_node *node;
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v7 14/16] dpif-netdev: Handle errors in reconfigure_pmd_threads().

2016-04-07 Thread Daniele Di Proietto
Errors returned by netdev_set_multiq() and netdev_rxq_open() weren't
handled properly in reconfigure_pmd_threads().  In case of error now we
remove the port from the datapath.

Also, part of the code is moved into a new function, port_reconfigure().

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 78 ++-
 1 file changed, 48 insertions(+), 30 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index bf04867..fc81741 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2593,44 +2593,62 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 }
 }
 
+static int
+port_reconfigure(struct dp_netdev_port *port)
+{
+struct netdev *netdev = port->netdev;
+int requested_n_rxq = netdev_requested_n_rxq(netdev);
+int i, err;
+
+if (!netdev_is_pmd(port->netdev)
+|| port->latest_requested_n_rxq != requested_n_rxq) {
+return 0;
+}
+
+/* Closes the existing 'rxq's. */
+for (i = 0; i < port->n_rxq; i++) {
+netdev_rxq_close(port->rxq[i]);
+port->rxq[i] = NULL;
+}
+port->n_rxq = 0;
+
+/* Sets the new rx queue config. */
+err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores() + 1,
+requested_n_rxq);
+if (err && (err != EOPNOTSUPP)) {
+VLOG_ERR("Failed to set dpdk interface %s rx_queue to: %u",
+ netdev_get_name(port->netdev), requested_n_rxq);
+return err;
+}
+/* If the set_multiq() above succeeds, reopens the 'rxq's. */
+port->rxq = xrealloc(port->rxq, sizeof *port->rxq * netdev_n_rxq(netdev));
+for (i = 0; i < netdev_n_rxq(netdev); i++) {
+err = netdev_rxq_open(netdev, >rxq[i], i);
+if (err) {
+return err;
+}
+port->n_rxq++;
+}
+
+return 0;
+}
+
 static void
 reconfigure_pmd_threads(struct dp_netdev *dp)
 OVS_REQUIRES(dp->port_mutex)
 {
-struct dp_netdev_port *port;
+struct dp_netdev_port *port, *next;
 
 dp_netdev_destroy_all_pmds(dp);
 
-HMAP_FOR_EACH (port, node, >ports) {
-struct netdev *netdev = port->netdev;
-int requested_n_rxq = netdev_requested_n_rxq(netdev);
-if (netdev_is_pmd(port->netdev)
-&& port->latest_requested_n_rxq != requested_n_rxq) {
-int i, err;
+HMAP_FOR_EACH_SAFE (port, next, node, >ports) {
+int err;
 
-/* Closes the existing 'rxq's. */
-for (i = 0; i < port->n_rxq; i++) {
-netdev_rxq_close(port->rxq[i]);
-port->rxq[i] = NULL;
-}
-port->n_rxq = 0;
-
-/* Sets the new rx queue config. */
-err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores() + 1,
-requested_n_rxq);
-if (err && (err != EOPNOTSUPP)) {
-VLOG_ERR("Failed to set dpdk interface %s rx_queue to: %u",
- netdev_get_name(port->netdev),
- requested_n_rxq);
-return;
-}
-port->latest_requested_n_rxq = requested_n_rxq;
-/* If the set_multiq() above succeeds, reopens the 'rxq's. */
-port->n_rxq = netdev_n_rxq(port->netdev);
-port->rxq = xrealloc(port->rxq, sizeof *port->rxq * port->n_rxq);
-for (i = 0; i < port->n_rxq; i++) {
-netdev_rxq_open(port->netdev, >rxq[i], i);
-}
+err = port_reconfigure(port);
+if (err) {
+hmap_remove(>ports, >node);
+seq_change(dp->port_seq);
+port_destroy(port);
 }
 }
 /* Reconfigures the cpu mask. */
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v7 11/16] ovs-thread: Do not quiesce in ovs_mutex_cond_wait().

2016-04-07 Thread Daniele Di Proietto
ovs_mutex_cond_wait() is used in many functions in dpif-netdev to
synchronize with pmd threads, but we can't guarantee that the callers do
not hold RCU references, so it's better to avoid quiescing.

In system_stats_thread_func() the code relied on ovs_mutex_cond_wait()
to introduce a quiescent state, so explicit calls to
ovsrcu_quiesce_start() and ovsrcu_quiesce_end() are added there.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/ovs-thread.c| 2 --
 vswitchd/system-stats.c | 6 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
index 3c065cf..26dd928 100644
--- a/lib/ovs-thread.c
+++ b/lib/ovs-thread.c
@@ -253,9 +253,7 @@ ovs_mutex_cond_wait(pthread_cond_t *cond, const struct 
ovs_mutex *mutex_)
 struct ovs_mutex *mutex = CONST_CAST(struct ovs_mutex *, mutex_);
 int error;
 
-ovsrcu_quiesce_start();
 error = pthread_cond_wait(cond, >lock);
-ovsrcu_quiesce_end();
 
 if (OVS_UNLIKELY(error)) {
 ovs_abort(error, "pthread_cond_wait failed");
diff --git a/vswitchd/system-stats.c b/vswitchd/system-stats.c
index df4971e..129f0cf 100644
--- a/vswitchd/system-stats.c
+++ b/vswitchd/system-stats.c
@@ -37,6 +37,7 @@
 #include "json.h"
 #include "latch.h"
 #include "openvswitch/ofpbuf.h"
+#include "ovs-rcu.h"
 #include "ovs-thread.h"
 #include "poll-loop.h"
 #include "shash.h"
@@ -615,7 +616,12 @@ system_stats_thread_func(void *arg OVS_UNUSED)
 
 ovs_mutex_lock();
 while (!enabled) {
+/* The thread is sleeping, potentially for a long time, and it's
+ * not holding RCU protected references, so it makes sense to
+ * quiesce */
+ovsrcu_quiesce_start();
 ovs_mutex_cond_wait(, );
+ovsrcu_quiesce_end();
 }
 ovs_mutex_unlock();
 
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v7 08/16] dpif-netdev: Add pmd thread local port cache for transmission.

2016-04-07 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 243 +++---
 1 file changed, 175 insertions(+), 68 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 8c5893d..5d1cc43 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -185,6 +185,7 @@ static bool dpcls_lookup(const struct dpcls *cls,
  *
  *dp_netdev_mutex (global)
  *port_mutex
+ *non_pmd_mutex
  */
 struct dp_netdev {
 const struct dpif_class *const class;
@@ -380,6 +381,13 @@ struct rxq_poll {
 struct ovs_list node;
 };
 
+/* Contained by struct dp_netdev_pmd_thread's 'port_cache' or 'tx_ports'. */
+struct tx_port {
+odp_port_t port_no;
+struct netdev *netdev;
+struct hmap_node node;
+};
+
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
  * the performance overhead of interrupt processing.  Therefore netdev can
  * not implement rx-wait for these devices.  dpif-netdev needs to poll
@@ -436,10 +444,18 @@ struct dp_netdev_pmd_thread {
 atomic_int tx_qid;  /* Queue id used by this pmd thread to
  * send packets on all netdevs */
 
-struct ovs_mutex poll_mutex;/* Mutex for poll_list. */
+struct ovs_mutex port_mutex;/* Mutex for 'poll_list' and 'tx_ports'. */
 /* List of rx queues to poll. */
 struct ovs_list poll_list OVS_GUARDED;
-int poll_cnt;   /* Number of elemints in poll_list. */
+/* Number of elements in 'poll_list' */
+int poll_cnt;
+/* Map of 'tx_port's used for transmission.  Written by the main thread,
+ * read by the pmd thread. */
+struct hmap tx_ports OVS_GUARDED;
+
+/* Map of 'tx_port' used in the fast path. This is a thread-local copy of
+ * 'tx_ports'. */
+struct hmap port_cache;
 
 /* Only a pmd thread can write on its own 'cycles' and 'stats'.
  * The main thread keeps 'stats_zero' and 'cycles_zero' as base
@@ -495,7 +511,7 @@ dp_netdev_pmd_get_next(struct dp_netdev *dp, struct 
cmap_position *pos);
 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
 static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
 static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id);
-static void dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread *pmd);
+static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
  struct dp_netdev_port *port);
 static void
@@ -509,6 +525,8 @@ static void dp_netdev_reset_pmd_threads(struct dp_netdev 
*dp);
 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
+static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
+OVS_REQUIRES(pmd->port_mutex);
 
 static inline bool emc_entry_alive(struct emc_entry *ce);
 static void emc_clear_entry(struct emc_entry *ce);
@@ -691,7 +709,7 @@ pmd_info_show_rxq(struct ds *reply, struct 
dp_netdev_pmd_thread *pmd)
 ds_put_format(reply, "pmd thread numa_id %d core_id %u:\n",
   pmd->numa_id, pmd->core_id);
 
-ovs_mutex_lock(>poll_mutex);
+ovs_mutex_lock(>port_mutex);
 LIST_FOR_EACH (poll, node, >poll_list) {
 const char *name = netdev_get_name(poll->port->netdev);
 
@@ -705,7 +723,7 @@ pmd_info_show_rxq(struct ds *reply, struct 
dp_netdev_pmd_thread *pmd)
 ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx));
 prev_name = name;
 }
-ovs_mutex_unlock(>poll_mutex);
+ovs_mutex_unlock(>port_mutex);
 ds_put_cstr(reply, "\n");
 }
 }
@@ -1078,6 +1096,11 @@ dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
 int old_seq;
 
 if (pmd->core_id == NON_PMD_CORE_ID) {
+ovs_mutex_lock(>dp->non_pmd_mutex);
+ovs_mutex_lock(>port_mutex);
+pmd_load_cached_ports(pmd);
+ovs_mutex_unlock(>port_mutex);
+ovs_mutex_unlock(>dp->non_pmd_mutex);
 return;
 }
 
@@ -1200,9 +1223,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 
 cmap_insert(>ports, >node, hash_port_no(port_no));
 
-if (netdev_is_pmd(port->netdev)) {
-dp_netdev_add_port_to_pmds(dp, port);
-}
+dp_netdev_add_port_to_pmds(dp, port);
 seq_change(dp->port_seq);
 
 return 0;
@@ -1371,6 +1392,9 @@ do_del_port(struct dp_netdev *dp, struct dp_netdev_port 
*port)
 {
 cmap_remove(>ports, >node, hash_odp_port(port->port_no));
 seq_change(dp->port_seq);
+
+dp_netdev_del_port_from_all_pmds(dp, port);
+
 if (netdev_is_pmd(port->netdev)

[ovs-dev] [PATCH v7 10/16] dpif-netdev: Use hmap for ports.

2016-04-07 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 93 ---
 1 file changed, 55 insertions(+), 38 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 5d1cc43..fba6592 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -198,7 +198,7 @@ struct dp_netdev {
  *
  * Protected by RCU.  Take the mutex to add or remove ports. */
 struct ovs_mutex port_mutex;
-struct cmap ports;
+struct hmap ports;
 struct seq *port_seq;   /* Incremented whenever a port changes. */
 
 /* Protects access to ofproto-dpif-upcall interface during revalidator
@@ -229,7 +229,8 @@ struct dp_netdev {
 };
 
 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
-odp_port_t);
+odp_port_t)
+OVS_REQUIRES(>port_mutex);
 
 enum dp_stat_type {
 DP_STAT_EXACT_HIT,  /* Packets that had an exact match (emc). */
@@ -249,7 +250,7 @@ enum pmd_cycles_counter_type {
 struct dp_netdev_port {
 odp_port_t port_no;
 struct netdev *netdev;
-struct cmap_node node;  /* Node in dp_netdev's 'ports'. */
+struct hmap_node node;  /* Node in dp_netdev's 'ports'. */
 struct netdev_saved_flags *sf;
 unsigned n_rxq; /* Number of elements in 'rxq' */
 struct netdev_rxq **rxq;
@@ -475,9 +476,11 @@ struct dpif_netdev {
 };
 
 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
-  struct dp_netdev_port **portp);
+  struct dp_netdev_port **portp)
+OVS_REQUIRES(dp->port_mutex);
 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
-struct dp_netdev_port **portp);
+struct dp_netdev_port **portp)
+OVS_REQUIRES(dp->port_mutex);
 static void dp_netdev_free(struct dp_netdev *)
 OVS_REQUIRES(dp_netdev_mutex);
 static int do_add_port(struct dp_netdev *dp, const char *devname,
@@ -521,7 +524,8 @@ dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
  struct dp_netdev_port *port, struct netdev_rxq *rx);
 static struct dp_netdev_pmd_thread *
 dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
-static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp);
+static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
+OVS_REQUIRES(dp->port_mutex);
 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
@@ -912,7 +916,7 @@ create_dp_netdev(const char *name, const struct dpif_class 
*class,
 atomic_flag_clear(>destroyed);
 
 ovs_mutex_init(>port_mutex);
-cmap_init(>ports);
+hmap_init(>ports);
 dp->port_seq = seq_create();
 fat_rwlock_init(>upcall_rwlock);
 
@@ -983,7 +987,7 @@ static void
 dp_netdev_free(struct dp_netdev *dp)
 OVS_REQUIRES(dp_netdev_mutex)
 {
-struct dp_netdev_port *port;
+struct dp_netdev_port *port, *next;
 
 shash_find_and_delete(_netdevs, dp->name);
 
@@ -992,15 +996,14 @@ dp_netdev_free(struct dp_netdev *dp)
 ovsthread_key_delete(dp->per_pmd_key);
 
 ovs_mutex_lock(>port_mutex);
-CMAP_FOR_EACH (port, node, >ports) {
-/* PMD threads are destroyed here. do_del_port() cannot quiesce */
+HMAP_FOR_EACH_SAFE (port, next, node, >ports) {
 do_del_port(dp, port);
 }
 ovs_mutex_unlock(>port_mutex);
 cmap_destroy(>poll_threads);
 
 seq_destroy(dp->port_seq);
-cmap_destroy(>ports);
+hmap_destroy(>ports);
 ovs_mutex_destroy(>port_mutex);
 
 /* Upcalls must be disabled at this point */
@@ -1221,7 +1224,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, 
const char *type,
 return error;
 }
 
-cmap_insert(>ports, >node, hash_port_no(port_no));
+hmap_insert(>ports, >node, hash_port_no(port_no));
 
 dp_netdev_add_port_to_pmds(dp, port);
 seq_change(dp->port_seq);
@@ -1287,10 +1290,11 @@ is_valid_port_number(odp_port_t port_no)
 
 static struct dp_netdev_port *
 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
+OVS_REQUIRES(>port_mutex)
 {
 struct dp_netdev_port *port;
 
-CMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), >ports) {
+HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), >ports) {
 if (port->port_no == port_no) {
 return port;
 }
@@ -1301,6 +1305,7 @@ dp_netdev_lookup_port(const struct dp_netdev *dp, 
odp_port_t port_no)
 static int
 get_port_by_number(struct dp_netdev *dp,
odp_port_t port_no, struct dp_netdev_port **portp)
+OVS_REQUIR

[ovs-dev] [PATCH v7 09/16] hmap: Use struct for hmap_at_position().

2016-04-07 Thread Daniele Di Proietto
The interface will be more similar to the cmap.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/hmap.c | 26 --
 lib/hmap.h |  7 ++-
 lib/sset.c | 12 +---
 lib/sset.h |  7 ++-
 ofproto/ofproto-dpif.c |  8 +++-
 5 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/lib/hmap.c b/lib/hmap.c
index b70ce51..9462c5e 100644
--- a/lib/hmap.c
+++ b/lib/hmap.c
@@ -236,24 +236,22 @@ hmap_random_node(const struct hmap *hmap)
 }
 
 /* Returns the next node in 'hmap' in hash order, or NULL if no nodes remain in
- * 'hmap'.  Uses '*bucketp' and '*offsetp' to determine where to begin
- * iteration, and stores new values to pass on the next iteration into them
- * before returning.
+ * 'hmap'.  Uses '*pos' to determine where to begin iteration, and updates
+ * '*pos' to pass on the next iteration into them before returning.
  *
  * It's better to use plain HMAP_FOR_EACH and related functions, since they are
  * faster and better at dealing with hmaps that change during iteration.
  *
- * Before beginning iteration, store 0 into '*bucketp' and '*offsetp'.
- */
+ * Before beginning iteration, set '*pos' to all zeros. */
 struct hmap_node *
 hmap_at_position(const struct hmap *hmap,
- uint32_t *bucketp, uint32_t *offsetp)
+ struct hmap_position *pos)
 {
 size_t offset;
 size_t b_idx;
 
-offset = *offsetp;
-for (b_idx = *bucketp; b_idx <= hmap->mask; b_idx++) {
+offset = pos->offset;
+for (b_idx = pos->bucket; b_idx <= hmap->mask; b_idx++) {
 struct hmap_node *node;
 size_t n_idx;
 
@@ -261,11 +259,11 @@ hmap_at_position(const struct hmap *hmap,
  n_idx++, node = node->next) {
 if (n_idx == offset) {
 if (node->next) {
-*bucketp = node->hash & hmap->mask;
-*offsetp = offset + 1;
+pos->bucket = node->hash & hmap->mask;
+pos->offset = offset + 1;
 } else {
-*bucketp = (node->hash & hmap->mask) + 1;
-*offsetp = 0;
+pos->bucket = (node->hash & hmap->mask) + 1;
+pos->offset = 0;
 }
 return node;
 }
@@ -273,8 +271,8 @@ hmap_at_position(const struct hmap *hmap,
 offset = 0;
 }
 
-*bucketp = 0;
-*offsetp = 0;
+pos->bucket = 0;
+pos->offset = 0;
 return NULL;
 }
 
diff --git a/lib/hmap.h b/lib/hmap.h
index 08c4719..9a96c5f 100644
--- a/lib/hmap.h
+++ b/lib/hmap.h
@@ -201,8 +201,13 @@ static inline struct hmap_node *hmap_first(const struct 
hmap *);
 static inline struct hmap_node *hmap_next(const struct hmap *,
   const struct hmap_node *);
 
+struct hmap_position {
+unsigned int bucket;
+unsigned int offset;
+};
+
 struct hmap_node *hmap_at_position(const struct hmap *,
-   uint32_t *bucket, uint32_t *offset);
+   struct hmap_position *);
 
 /* Returns the number of nodes currently in 'hmap'. */
 static inline size_t
diff --git a/lib/sset.c b/lib/sset.c
index f9d4fc0..4fd3fae 100644
--- a/lib/sset.c
+++ b/lib/sset.c
@@ -251,21 +251,19 @@ sset_equals(const struct sset *a, const struct sset *b)
 }
 
 /* Returns the next node in 'set' in hash order, or NULL if no nodes remain in
- * 'set'.  Uses '*bucketp' and '*offsetp' to determine where to begin
- * iteration, and stores new values to pass on the next iteration into them
- * before returning.
+ * 'set'.  Uses '*pos' to determine where to begin iteration, and updates
+ * '*pos' to pass on the next iteration into them before returning.
  *
  * It's better to use plain SSET_FOR_EACH and related functions, since they are
  * faster and better at dealing with ssets that change during iteration.
  *
- * Before beginning iteration, store 0 into '*bucketp' and '*offsetp'.
- */
+ * Before beginning iteration, set '*pos' to all zeros. */
 struct sset_node *
-sset_at_position(const struct sset *set, uint32_t *bucketp, uint32_t *offsetp)
+sset_at_position(const struct sset *set, struct sset_position *pos)
 {
 struct hmap_node *hmap_node;
 
-hmap_node = hmap_at_position(>map, bucketp, offsetp);
+hmap_node = hmap_at_position(>map, >pos);
 return SSET_NODE_FROM_HMAP_NODE(hmap_node);
 }
 
diff --git a/lib/sset.h b/lib/sset.h
index 7d1d496..9c2f703 100644
--- a/lib/sset.h
+++ b/lib/sset.h
@@ -64,8 +64,13 @@ char *sset_pop(struct sset *);
 struct sset_node *sset_find(const struct sset *, const char *);
 bool sset_contains(const struct sset *, const char *);
 bool sset_equals(const struct sset *, const struct sset *);
+
+struct sset_position {
+struct hmap_position pos;
+};
+
 struct sset_node *sset

[ovs-dev] [PATCH v7 04/16] dpif-netdev: Add functions to modify rxq without reloading pmd threads.

2016-04-07 Thread Daniele Di Proietto
This commit introduces some functions to add/remove rxqs from pmd
threads without reloading them.  They will be used by next commits.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 77 ---
 1 file changed, 56 insertions(+), 21 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 27277e8..9c32c64 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -496,8 +496,6 @@ static void dp_netdev_destroy_all_pmds(struct dp_netdev 
*dp);
 static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
 static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id);
 static void dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread *pmd);
-static void dp_netdev_del_port_from_pmd(struct dp_netdev_port *port,
-struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
  struct dp_netdev_port *port);
 static void
@@ -3003,11 +3001,11 @@ dp_netdev_pmd_clear_poll_list(struct 
dp_netdev_pmd_thread *pmd)
 ovs_mutex_unlock(>poll_mutex);
 }
 
-/* Deletes all rx queues of 'port' from poll_list of pmd thread and
- * reloads it if poll_list was changed. */
-static void
-dp_netdev_del_port_from_pmd(struct dp_netdev_port *port,
-struct dp_netdev_pmd_thread *pmd)
+/* Deletes all rx queues of 'port' from poll_list of pmd thread.  Returns true
+ * if 'port' was found in 'pmd' (therefore a restart is required). */
+static bool
+dp_netdev_del_port_from_pmd__(struct dp_netdev_port *port,
+  struct dp_netdev_pmd_thread *pmd)
 {
 struct rxq_poll *poll, *next;
 bool found = false;
@@ -3022,8 +3020,30 @@ dp_netdev_del_port_from_pmd(struct dp_netdev_port *port,
 }
 }
 ovs_mutex_unlock(>poll_mutex);
-if (found) {
-dp_netdev_reload_pmd__(pmd);
+
+return found;
+}
+
+/* Deletes all rx queues of 'port' from all pmd threads.  The pmd threads that
+ * need to be restarted are inserted in 'to_reload'. */
+static void
+dp_netdev_del_port_from_all_pmds__(struct dp_netdev *dp,
+   struct dp_netdev_port *port,
+   struct hmapx *to_reload)
+{
+int numa_id = netdev_get_numa_id(port->netdev);
+struct dp_netdev_pmd_thread *pmd;
+
+CMAP_FOR_EACH (pmd, node, >poll_threads) {
+if (pmd->numa_id == numa_id) {
+bool found;
+
+found = dp_netdev_del_port_from_pmd__(port, pmd);
+
+if (found) {
+hmapx_add(to_reload, pmd);
+}
+   }
 }
 }
 
@@ -3033,16 +3053,21 @@ static void
 dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
  struct dp_netdev_port *port)
 {
-int numa_id = netdev_get_numa_id(port->netdev);
 struct dp_netdev_pmd_thread *pmd;
+struct hmapx to_reload = HMAPX_INITIALIZER(_reload);
+struct hmapx_node *node;
 
-CMAP_FOR_EACH (pmd, node, >poll_threads) {
-if (pmd->numa_id == numa_id) {
-dp_netdev_del_port_from_pmd(port, pmd);
-   }
+dp_netdev_del_port_from_all_pmds__(dp, port, _reload);
+
+HMAPX_FOR_EACH (node, _reload) {
+pmd = (struct dp_netdev_pmd_thread *) node->data;
+dp_netdev_reload_pmd__(pmd);
 }
+
+hmapx_destroy(_reload);
 }
 
+
 /* Returns PMD thread from this numa node with fewer rx queues to poll.
  * Returns NULL if there is no PMD threads on this numa node.
  * Can be called safely only by main thread. */
@@ -3078,18 +3103,16 @@ dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread 
*pmd,
 pmd->poll_cnt++;
 }
 
-/* Distributes all rx queues of 'port' between all PMD threads and reloads
- * them if needed. */
+/* Distributes all rx queues of 'port' between all PMD threads in 'dp'. The
+ * pmd threads that need to be restarted are inserted in 'to_reload'. */
 static void
-dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
+dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, struct dp_netdev_port *port,
+ struct hmapx *to_reload)
 {
 int numa_id = netdev_get_numa_id(port->netdev);
 struct dp_netdev_pmd_thread *pmd;
-struct hmapx to_reload;
-struct hmapx_node *node;
 int i;
 
-hmapx_init(_reload);
 /* Cannot create pmd threads for invalid numa node. */
 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
 
@@ -3106,8 +3129,20 @@ dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct 
dp_netdev_port *port)
 dp_netdev_add_rxq_to_pmd(pmd, port, port->rxq[i]);
 ovs_mutex_unlock(>poll_mutex);
 
-hmapx_add(_reload, pmd);
+hmapx_add(to_reload, pmd);
 }
+}
+
+/* Distributes all rx queues of 'port' between all PMD threads in 'dp' and
+ * reloads them, if needed. */
+s

[ovs-dev] [PATCH v7 02/16] dpif-netdev: Remove unused 'index' in dp_netdev_pmd_thread.

2016-04-07 Thread Daniele Di Proietto
Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7959342..78e4e35 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -431,8 +431,6 @@ struct dp_netdev_pmd_thread {
 struct latch exit_latch;/* For terminating the pmd thread. */
 atomic_uint change_seq; /* For reloading pmd ports. */
 pthread_t thread;
-int index;  /* Idx of this pmd thread among pmd*/
-/* threads on same numa node. */
 unsigned core_id;   /* CPU core id of this pmd thread. */
 int numa_id;/* numa node id of this pmd thread. */
 atomic_int tx_qid;  /* Queue id used by this pmd thread to
@@ -486,8 +484,8 @@ static void dp_netdev_recirculate(struct 
dp_netdev_pmd_thread *,
 static void dp_netdev_disable_upcall(struct dp_netdev *);
 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
-struct dp_netdev *dp, int index,
-unsigned core_id, int numa_id);
+struct dp_netdev *dp, unsigned core_id,
+int numa_id);
 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_set_nonpmd(struct dp_netdev *dp);
 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
@@ -2788,8 +2786,7 @@ dp_netdev_set_nonpmd(struct dp_netdev *dp)
 struct dp_netdev_pmd_thread *non_pmd;
 
 non_pmd = xzalloc(sizeof *non_pmd);
-dp_netdev_configure_pmd(non_pmd, dp, 0, NON_PMD_CORE_ID,
-OVS_NUMA_UNSPEC);
+dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
 }
 
 /* Caller must have valid pointer to 'pmd'. */
@@ -2830,10 +2827,9 @@ dp_netdev_pmd_get_next(struct dp_netdev *dp, struct 
cmap_position *pos)
 /* Configures the 'pmd' based on the input argument. */
 static void
 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
-int index, unsigned core_id, int numa_id)
+unsigned core_id, int numa_id)
 {
 pmd->dp = dp;
-pmd->index = index;
 pmd->core_id = core_id;
 pmd->numa_id = numa_id;
 pmd->poll_cnt = 0;
@@ -3141,7 +3137,7 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int 
numa_id)
 for (i = 0; i < can_have; i++) {
 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
 pmds[i] = xzalloc(sizeof **pmds);
-dp_netdev_configure_pmd(pmds[i], dp, i, core_id, numa_id);
+dp_netdev_configure_pmd(pmds[i], dp, core_id, numa_id);
 }
 
 /* Distributes rx queues of this numa node between new pmd threads. */
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v7 00/16] Reconfigure netdev at runtime

2016-04-07 Thread Daniele Di Proietto
Currently we treat set_multiq() calls specially in netdev and dpif-netdev:
every pmd thread must be stopped and set_multiq() is allowed to destroy and
recreate the device.

I think we can improve this by:
* Generalizing the mechanism to allow changing other parameters at runtime
  (such as MTU).
* Involving the layer above (dpif-netdev) less.  The request for changes
  often comes from below (netdev_dpdk_set_config(), or the vhost new_device()
  callback).  There's no need for dpif-netdev to remember the requested value,
  all that it needs to know is that a configuration change is requested.

This series implements exactly this: a mechanism to allow a netdev provider
to request configuration changes, to which dpif-netdev will respond by
stopping rx/tx and calling a netdev function to apply the new configuration.

The new mechanism is used in this series to replace the set_multiq() call,
but the idea is to use it also at least for:

* Changing the MTU at runtime
* Automatically detecting the number of rx queues for a vhost-user device
* Moving a DPDK vhost device to the proper NUMA socket

The first commits refactor some code in dpif-netdev and, most importantly,
avoid using RCU for ports.  Each thread will have its local copy of all the
ports in the datapath.

The series is also available here:

https://github.com/ddiproietto/ovs/tree/configchangesv7

v7:
* Dropped already applied patches.
* Stop using RCU for ports.
* Rebased against master.

v6:
* Rebased against master.
* Check return value of netdev_rxq_open().
* Fix comment.

v5:
* Style fixes.
* Fixed a bug in dp_netdev_free() in patch 6.

v4:
* Added another patch to make variable names uniform in netdev-dpdk (no
  functional change)
* Update some netdev comments to document the relation between
  netdev_set_multiq() and netdev_reconfigure()
* Clarify that when netdev_reconfigure() is called no call to netdev_send()
  or netdev_rxq_recv() must be issued.
* Move check to skip reconfiguration in netdev_dpdk_reconfigure() before
  rte_eth_dev_stop().

v3:
* Fixed another outdated comment about rx queue configuration, as pointed out
  by Mark
* Removed unnecessary and buggy initialization of requested_n_rxq in
  reconfigure_pmd_threads().
* Removed unused 'err' variable in netdev_dpdk_set_multiq().
* Changed comparison in netdev_set_multiq() to use previous
  'netdev->requested_n_txq' instead of 'netdev->up.n_txq'
* Return immediately in netdev_dpdk_reconfigure() if configuration didn't
  change anything.

v2:
* Fixed do_add_port(): we have to call netdev_reconfigure() before opening
  the rxqs.  This prevents memory leaks, and makes sure that the datapath
  polls the appropriate number of queues
* Fixed netdev_dpdk_vhost_set_multiq(): it must call
  netdev_request_reconfigure(). Since it is now equal to
  netdev_dpdk_set_multiq(), the two functions have been merged.
* Fixed netdev_dpdk_set_config(): dev->requested_n_rxq is now accessed
  while holding the appropriate mutex.
* Fixed some outdated comments about rx queue configuration.

Daniele Di Proietto (16):
  dpif-netdev: Destroy 'port_mutex' in dp_netdev_free().
  dpif-netdev: Remove unused 'index' in dp_netdev_pmd_thread.
  dpif-netdev: Factor out port_create() from do_add_port().
  dpif-netdev: Add functions to modify rxq without reloading pmd
threads.
  dpif-netdev: Fix race condition in pmd thread initialization.
  dpif-netdev: Remove duplicate code in dp_netdev_set_pmds_on_numa().
  hmap: Add HMAP_FOR_EACH_POP.
  dpif-netdev: Add pmd thread local port cache for transmission.
  hmap: Use struct for hmap_at_position().
  dpif-netdev: Use hmap for ports.
  ovs-thread: Do not quiesce in ovs_mutex_cond_wait().
  ofproto-dpif: Call dpif_poll_threads_set() before dpif_run()
  dpif-netdev: Change pmd thread configuration in dpif_netdev_run().
  dpif-netdev: Handle errors in reconfigure_pmd_threads().
  netdev: Add reconfigure request mechanism.
  netdev-dpdk: Use ->reconfigure() call to change rx/tx queues.

 lib/cfm.c|   5 +-
 lib/dpif-netdev.c| 689 +++
 lib/dpif-provider.h  |   3 +-
 lib/hmap.c   |  26 +-
 lib/hmap.h   |  11 +-
 lib/id-pool.c|   5 +-
 lib/learning-switch.c|   5 +-
 lib/netdev-bsd.c |   1 +
 lib/netdev-dpdk.c| 194 ++--
 lib/netdev-dummy.c   |   1 +
 lib/netdev-linux.c   |   6 +-
 lib/netdev-provider.h|  50 +++-
 lib/netdev-vport.c   |   1 +
 lib/netdev.c |  72 +++--
 lib/netdev.h |   7 +-
 lib/odp-util.c   |   7 +-
 lib/ovs-thread.c |   2 -
 lib/sset.c   |  12 +-
 lib/sset.h   |   7 +-
 ofproto/bond.c   |  10 +-
 ofproto/in-band.c|   5 +-
 ofproto/ofproto-dpif-ipfix.c |   5 +-
 ofproto/ofproto-dpif-xlate.c |   5 +-
 ofproto/ofproto-dpif.c   |  12 +-
 ofpro

[ovs-dev] [PATCH v7 01/16] dpif-netdev: Destroy 'port_mutex' in dp_netdev_free().

2016-04-07 Thread Daniele Di Proietto
Found by inspection.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2870951..7959342 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -987,6 +987,7 @@ dp_netdev_free(struct dp_netdev *dp)
 
 seq_destroy(dp->port_seq);
 cmap_destroy(>ports);
+ovs_mutex_destroy(>port_mutex);
 
 /* Upcalls must be disabled at this point */
 dp_netdev_destroy_upcall_lock(dp);
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v7 06/16] dpif-netdev: Remove duplicate code in dp_netdev_set_pmds_on_numa().

2016-04-07 Thread Daniele Di Proietto
Instead of duplicating code to add ports in
dp_netdev_set_pmds_on_numa(), we can always use
dp_netdev_add_port_to_pmds__().

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 58 +--
 1 file changed, 22 insertions(+), 36 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2424d3e..8c5893d 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3112,13 +3112,12 @@ dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, 
struct dp_netdev_port *port,
 
 /* Cannot create pmd threads for invalid numa node. */
 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
+dp_netdev_set_pmds_on_numa(dp, numa_id);
 
 for (i = 0; i < port->n_rxq; i++) {
 pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
 if (!pmd) {
-/* There is no pmd threads on this numa node. */
-dp_netdev_set_pmds_on_numa(dp, numa_id);
-/* Assigning of rx queues done. */
+VLOG_WARN("There's no pmd thread on numa node %d", numa_id);
 break;
 }
 
@@ -3157,9 +3156,9 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int 
numa_id)
 int n_pmds;
 
 if (!ovs_numa_numa_id_is_valid(numa_id)) {
-VLOG_ERR("Cannot create pmd threads due to numa id (%d)"
- "invalid", numa_id);
-return ;
+VLOG_WARN("Cannot create pmd threads due to numa id (%d) invalid",
+  numa_id);
+return;
 }
 
 n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
@@ -3168,46 +3167,25 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int 
numa_id)
  * in which 'netdev' is on, do nothing.  Else, creates the
  * pmd threads for the numa node. */
 if (!n_pmds) {
-int can_have, n_unpinned, i, index = 0;
-struct dp_netdev_pmd_thread **pmds;
-struct dp_netdev_port *port;
+int can_have, n_unpinned, i;
 
 n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
 if (!n_unpinned) {
-VLOG_ERR("Cannot create pmd threads due to out of unpinned "
- "cores on numa node %d", numa_id);
+VLOG_WARN("Cannot create pmd threads due to out of unpinned "
+  "cores on numa node %d", numa_id);
 return;
 }
 
 /* If cpu mask is specified, uses all unpinned cores, otherwise
  * tries creating NR_PMD_THREADS pmd threads. */
 can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, 
NR_PMD_THREADS);
-pmds = xzalloc(can_have * sizeof *pmds);
 for (i = 0; i < can_have; i++) {
 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
-pmds[i] = xzalloc(sizeof **pmds);
-dp_netdev_configure_pmd(pmds[i], dp, core_id, numa_id);
-}
+struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
 
-/* Distributes rx queues of this numa node between new pmd threads. */
-CMAP_FOR_EACH (port, node, >ports) {
-if (netdev_is_pmd(port->netdev)
-&& netdev_get_numa_id(port->netdev) == numa_id) {
-for (i = 0; i < port->n_rxq; i++) {
-/* Make thread-safety analyser happy. */
-ovs_mutex_lock([index]->poll_mutex);
-dp_netdev_add_rxq_to_pmd(pmds[index], port, port->rxq[i]);
-ovs_mutex_unlock([index]->poll_mutex);
-index = (index + 1) % can_have;
-}
-}
-}
-
-/* Actual start of pmd threads. */
-for (i = 0; i < can_have; i++) {
-pmds[i]->thread = ovs_thread_create("pmd", pmd_thread_main, 
pmds[i]);
+dp_netdev_configure_pmd(pmd, dp, core_id, numa_id);
+pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
 }
-free(pmds);
 VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
 }
 }
@@ -3219,14 +3197,22 @@ static void
 dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
 {
 struct dp_netdev_port *port;
+struct hmapx to_reload = HMAPX_INITIALIZER(_reload);
+struct hmapx_node *node;
 
 CMAP_FOR_EACH (port, node, >ports) {
 if (netdev_is_pmd(port->netdev)) {
-int numa_id = netdev_get_numa_id(port->netdev);
-
-dp_netdev_set_pmds_on_numa(dp, numa_id);
+dp_netdev_add_port_to_pmds__(dp, port, _reload);
 }
 }
+
+HMAPX_FOR_EACH (node, _reload) {
+struct dp_netdev_pmd_thread *pmd;
+pmd = (struct dp_netdev_pmd_thread *) node->data;
+dp_netdev_reload_pmd__(pmd);
+}
+
+hmapx_destroy(_reload);
 }
 
 static char *
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v7 05/16] dpif-netdev: Fix race condition in pmd thread initialization.

2016-04-07 Thread Daniele Di Proietto
The pmds and the main threads are synchronized using a condition
variable.  The main thread writes a new configuration, then it waits on
the condition variable.  A pmd thread reads the new configuration, then
it calls signal() on the condition variable. To make sure that the pmds
and the main thread have a consistent view, each signal() should be
backed by a wait().

Currently the first signal() doesn't have a corresponding wait().  If
the pmd thread takes a long time to start and the signal() is received
by a later wait, the threads will have an inconsistent view.

The commit fixes the problem by removing the first signal() from the
pmd thread.

This is hardly a problem on current master, because the main thread
will call the first wait() a long time after the creation of a pmd
thread.  It becomes a problem with the next commits.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
---
 lib/dpif-netdev.c | 21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 9c32c64..2424d3e 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2652,21 +2652,22 @@ dpif_netdev_wait(struct dpif *dpif)
 
 static int
 pmd_load_queues(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **ppoll_list)
-OVS_REQUIRES(pmd->poll_mutex)
 {
 struct rxq_poll *poll_list = *ppoll_list;
 struct rxq_poll *poll;
 int i;
 
+ovs_mutex_lock(>poll_mutex);
 poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);
 
 i = 0;
 LIST_FOR_EACH (poll, node, >poll_list) {
 poll_list[i++] = *poll;
 }
+ovs_mutex_unlock(>poll_mutex);
 
 *ppoll_list = poll_list;
-return pmd->poll_cnt;
+return i;
 }
 
 static void *
@@ -2685,13 +2686,10 @@ pmd_thread_main(void *f_)
 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
 pmd_thread_setaffinity_cpu(pmd->core_id);
+poll_cnt = pmd_load_queues(pmd, _list);
 reload:
 emc_cache_init(>flow_cache);
 
-ovs_mutex_lock(>poll_mutex);
-poll_cnt = pmd_load_queues(pmd, _list);
-ovs_mutex_unlock(>poll_mutex);
-
 /* List port/core affinity */
 for (i = 0; i < poll_cnt; i++) {
VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
@@ -2699,10 +2697,6 @@ reload:
 netdev_rxq_get_queue_id(poll_list[i].rx));
 }
 
-/* Signal here to make sure the pmd finishes
- * reloading the updated configuration. */
-dp_netdev_pmd_reload_done(pmd);
-
 for (;;) {
 for (i = 0; i < poll_cnt; i++) {
 dp_netdev_process_rxq_port(pmd, poll_list[i].port, 
poll_list[i].rx);
@@ -2725,14 +2719,17 @@ reload:
 }
 }
 
+poll_cnt = pmd_load_queues(pmd, &poll_list);
+/* Signal here to make sure the pmd finishes
+ * reloading the updated configuration. */
+dp_netdev_pmd_reload_done(pmd);
+
 emc_cache_uninit(&pmd->flow_cache);
 
 if (!latch_is_set(&pmd->exit_latch)){
 goto reload;
 }
 
-dp_netdev_pmd_reload_done(pmd);
-
 free(poll_list);
 return NULL;
 }
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v6 06/12] dpif-netdev: Wait an RCU grace period before freeing ports.

2016-04-07 Thread Daniele Di Proietto


On 01/04/2016 09:52, "Jarno Rajahalme" <ja...@ovn.org> wrote:

>
>> On Mar 30, 2016, at 8:08 PM, Daniele Di Proietto
>><diproiet...@vmware.com> wrote:
>> 
>> 
>> On 30/03/2016 16:01, "Ben Pfaff" <b...@ovn.org> wrote:
>> 
>>> (I'm taking a look at this patch specifically because Daniele asked me;
>>> I'm not planning to review the whole series.)
>>> 
>>> On Mon, Mar 28, 2016 at 12:41:40PM -0700, Daniele Di Proietto wrote:
>>>> The dpif-netdev datapath keeps ports in a cmap which is written only
>>>>by
>>>> the main thread (holding port_mutex), but which is read concurrently
>>>>by
>>>> many threads (most notably the pmd threads).
>>>> 
>>>> When removing ports from the datapath we should postpone the deletion,
>>>> otherwise another thread might access invalid memory while reading the
>>>> cmap.
>>>> 
>>>> This commit splits do_port_del() in do_port_remove() and
>>>> do_port_destroy(): the former removes the port from the cmap, while
>>>>the
>>>> latter reclaims the memory and drops the reference to the underlying
>>>> netdev.
>>> 
>>> s/del_port/port_del/ here:
>> 
>> Thanks, changed
>> 
>>> 
>>>> dpif_netdev_del_port() now uses ovsrcu_synchronize() before calling
>>>> do_port_destroy(), to avoid memory corruption in concurrent readers.
>>> 
>>> ovsrcu_synchronize() requires that nothing in the thread that calls it
>>> is relying on RCU to keep objects around.  That means that no caller of
>>> dpif_port_del()--there are a few of them--can rely on it.  This is
>>> usually a risky assumption, especially because this assumption can
>>> change later.  Is there reason to believe that it isn't important in
>>>all
>>> of these cases?
>> 
>> I agree that's risky, but I think it's the only way to keep the ports
>>RCU
>> protected, because a port needs to be effectively deleted before
>> dpif_netdev_port_del() can return.
>> 
>
>If this is because otherwise a following port_add can fail, as the old
>port is still around, maybe we could make the highest possible level of
>port_add detect the failure and then rcu_synchronize and try again? Would
>that work?
>
>  Jarno

After some thought I decided to avoid using RCU for ports. I'll send an
updated series soon.

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH] system-traffic: Fix packet-in format for tests.

2016-04-06 Thread Daniele Di Proietto
Thanks for fixing this!

Acked-by: Daniele Di Proietto <diproiet...@vmware.com>

On 06/04/2016 15:07, "Joe Stringer" <j...@ovn.org> wrote:

>Since continuations were introduced, the system-traffic tests which use
>OpenFlow monitors to check the results of datapath execution have been
>failing, because the new PACKET_IN2 format is used rather than
>PACKET_IN. Switch the expected output over to PACKET_IN2.
>
>Signed-off-by: Joe Stringer <j...@ovn.org>
>---
> tests/system-traffic.at | 8 
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
>diff --git a/tests/system-traffic.at b/tests/system-traffic.at
>index 28adbdcb9ee6..58212c1ed014 100644
>--- a/tests/system-traffic.at
>+++ b/tests/system-traffic.at
>@@ -181,9 +181,9 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 2
>ct\(table=0\) '5054000a50
> 
> dnl Check this output. We only see the latter two packets, not the first.
> AT_CHECK([cat ofctl_monitor.log], [0], [dnl
>-NXT_PACKET_IN (xid=0x0): total_len=42 in_port=1 (via action) data_len=42
>(unbuffered)
>+NXT_PACKET_IN2 (xid=0x0): total_len=42 in_port=1 (via action)
>data_len=42 (unbuffered)
> 
>udp,vlan_tci=0x,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_s
>rc=10.1.1.1,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=0,nw_ttl=0,tp_src=1,tp_dst=2
>udp_csum:0
>-NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=42
>ct_state=est|rpl|trk,in_port=2 (via action) data_len=42 (unbuffered)
>+NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=42
>ct_state=est|rpl|trk,in_port=2 (via action) data_len=42 (unbuffered)
> 
>udp,vlan_tci=0x,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_s
>rc=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=0,tp_src=2,tp_dst=1
>udp_csum:0
> ])
> 
>@@ -925,9 +925,9 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 2
>ct\(table=0\) 'e64c473528c9c6
> 
> dnl Check this output. We only see the latter two packets, not the first.
> AT_CHECK([cat ofctl_monitor.log], [0], [dnl
>-NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=47
>ct_state=new|trk,in_port=1 (via action) data_len=47 (unbuffered)
>+NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=47
>ct_state=new|trk,in_port=1 (via action) data_len=47 (unbuffered)
> 
>udp,vlan_tci=0x,dl_src=e6:4c:47:35:28:c9,dl_dst=c6:f9:4e:cb:72:db,nw_s
>rc=172.16.0.1,nw_dst=172.16.0.2,nw_tos=0,nw_ecn=0,nw_ttl=64,tp_src=41614,t
>p_dst= udp_csum:2096
>-NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=75
>ct_state=rel|rpl|trk,in_port=2 (via action) data_len=75 (unbuffered)
>+NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=75
>ct_state=rel|rpl|trk,in_port=2 (via action) data_len=75 (unbuffered)
> 
>icmp,vlan_tci=0x,dl_src=c6:f9:4e:cb:72:db,dl_dst=e6:4c:47:35:28:c9,nw_
>src=172.16.0.2,nw_dst=172.16.0.1,nw_tos=192,nw_ecn=0,nw_ttl=64,icmp_type=3
>,icmp_code=3 icmp_csum:553f
> ])
> 
>-- 
>2.1.4
>

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v2] dp-packet: Fix use of uninitialised value at emc_lookup.

2016-04-06 Thread Daniele Di Proietto
Thanks for the fix!

I've applied this to master and branch-2.5

2016-04-06 16:28 GMT-07:00 William Tu :
> Valgrind reports "Conditional jump or move depends on uninitialised value"
> and "Use of uninitialised value" at case 2016 ovn -- 3 HVs, 1 LS, 3
> lports/HV.  It is caused by 1) assigning an uninitialized value to 'key->hash'
> at emc_processing(). Due to uninit rss_hash_valid, dp_packet_rss_valid() might
> return true and undefined hash value is returned, and 2) at emc_lookup, the
> 'current_entry->key.hash' could be uninitialized due to dp_packet_clone().
> The patch fixes the two and as a result, a couple of calls to
> dp_packet_rss_valid() become redundant and thus are removed.
>
> Call stacks:
> - Conditional jump or move depends on uninitialised value(s)
> dpif_netdev_packet_get_rss_hash (dpif-netdev.c:3334)
> emc_processing (dpif-netdev.c:3455)
> dp_netdev_input__ (dpif-netdev.c:3639)
> and,
> - Use of uninitialised value of size 8
> emc_lookup (dpif-netdev.c:1785)
> emc_processing (dpif-netdev.c:3457)
> dp_netdev_input__ (dpif-netdev.c:3639)
>
> Signed-off-by: William Tu 
> ---
> v1->v2
> - use dp_packet_rss_invalidate() instead of direct assignment
> - fix dp_packet_clone_with_headroom()
> - remove redundant dp_packet_rss_invalidate()
> ---
>  lib/dp-packet.c| 16 +++-
>  lib/netdev-bsd.c   |  1 -
>  lib/netdev-dummy.c |  1 -
>  lib/netdev-linux.c |  1 -
>  4 files changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
> index aec7fe7..0c85d50 100644
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
> + * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2016 Nicira, Inc.
>   *
>   * Licensed under the Apache License, Version 2.0 (the "License");
>   * you may not use this file except in compliance with the License.
> @@ -29,6 +29,7 @@ dp_packet_init__(struct dp_packet *b, size_t allocated, 
> enum dp_packet_source so
>  b->source = source;
>  dp_packet_reset_offsets(b);
>  pkt_metadata_init(&b->md, 0);
> +dp_packet_rss_invalidate(b);
>  }
>
>  static void
> @@ -167,6 +168,19 @@ dp_packet_clone_with_headroom(const struct dp_packet 
> *buffer, size_t headroom)
>  new_buffer->l3_ofs = buffer->l3_ofs;
>  new_buffer->l4_ofs = buffer->l4_ofs;
>  new_buffer->md = buffer->md;
> +#ifdef DPDK_NETDEV
> +new_buffer->mbuf.ol_flags = buffer->mbuf.ol_flags;
> +#else
> +new_buffer->rss_hash_valid = buffer->rss_hash_valid;
> +#endif
> +
> +if (dp_packet_rss_valid(new_buffer)) {
> +#ifdef DPDK_NETDEV
> +new_buffer->mbuf.hash.rss = buffer->mbuf.hash.rss;
> +#else
> +new_buffer->rss_hash = buffer->rss_hash;
> +#endif
> +}
>
>  return new_buffer;
>  }
> diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
> index 75bd5a3..49c05f4 100644
> --- a/lib/netdev-bsd.c
> +++ b/lib/netdev-bsd.c
> @@ -641,7 +641,6 @@ netdev_bsd_rxq_recv(struct netdev_rxq *rxq_, struct 
> dp_packet **packets,
>  dp_packet_delete(packet);
>  } else {
>  dp_packet_pad(packet);
> -dp_packet_rss_invalidate(packet);
>  packets[0] = packet;
>  *c = 1;
>  }
> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
> index edc86fa..a1013ff 100644
> --- a/lib/netdev-dummy.c
> +++ b/lib/netdev-dummy.c
> @@ -905,7 +905,6 @@ netdev_dummy_rxq_recv(struct netdev_rxq *rxq_, struct 
> dp_packet **arr,
>  ovs_mutex_unlock(>mutex);
>
>  dp_packet_pad(packet);
> -dp_packet_rss_invalidate(packet);
>
>  arr[0] = packet;
>  *c = 1;
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 3f5b608..a7d7ac7 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -1116,7 +1116,6 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct 
> dp_packet **packets,
>  dp_packet_delete(buffer);
>  } else {
>  dp_packet_pad(buffer);
> -dp_packet_rss_invalidate(buffer);
>  packets[0] = buffer;
>  *c = 1;
>  }
> --
> 2.5.0
>
> ___
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH] dp-packet: Fix use of uninitialised value at emc_lookup.

2016-04-06 Thread Daniele Di Proietto
2016-04-06 10:09 GMT-07:00 Darrell Ball :
> On Wed, Apr 6, 2016 at 9:37 AM, William Tu  wrote:
>
>> Valgrind reports "Conditional jump or move depends on uninitialised value"
>> and "Use of uninitialised value" at case 2016 ovn -- 3 HVs, 1 LS, 3
>> lports/HV.  It is caused by reading uninitialized 'key->hash' at
>> emc_lookup()
>> and 'rss_hash_valid' from dp_packet_rss_valid(). At emc_processing(),
>> the value of key->hash is initialized by dpif_netdev_packet_get_rss_hash(),
>> which returns an uninitialized hash value.  Call stacks below:
>>
>> - Conditional jump or move depends on uninitialised value(s)
>> dpif_netdev_packet_get_rss_hash (dpif-netdev.c:3334)
>> emc_processing (dpif-netdev.c:3455)
>> dp_netdev_input__ (dpif-netdev.c:3639)
>> and,
>> - Use of uninitialised value of size 8
>> emc_lookup (dpif-netdev.c:1785)
>> emc_processing (dpif-netdev.c:3457)
>> dp_netdev_input__ (dpif-netdev.c:3639)
>>
>> Signed-off-by: William Tu 
>> ---
>>  lib/dp-packet.c | 9 -
>>  1 file changed, 8 insertions(+), 1 deletion(-)
>>
>> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
>> index aec7fe7..87ed329 100644
>> --- a/lib/dp-packet.c
>> +++ b/lib/dp-packet.c
>> @@ -1,5 +1,5 @@
>>  /*
>> - * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
>> + * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2016 Nicira, Inc.
>>   *
>>   * Licensed under the Apache License, Version 2.0 (the "License");
>>   * you may not use this file except in compliance with the License.
>> @@ -29,6 +29,13 @@ dp_packet_init__(struct dp_packet *b, size_t allocated,
>> enum dp_packet_source so
>>  b->source = source;
>>  dp_packet_reset_offsets(b);
>>  pkt_metadata_init(&b->md, 0);
>> +#ifdef DPDK_NETDEV
>> +b->mbuf.ol_flags &= ~PKT_RX_RSS_HASH;
>> +b->mbuf.hash.rss = 0;
>> +#else
>> +b->rss_hash_valid = false;
>> +b->rss_hash = 0;
>> +#endif
>>
>
>
> Just a general comment, not a review:
>
> Do you need to set the hash value to zero as well as set
> the "hash_valid" flag to false; should not setting the "hash_valid"
> flag to false be enough to handle a  initialization issue ?
>
> I think there is already an API for setting "hash_valid"
> to false here
>
> static inline void
> dp_packet_rss_invalidate(struct dp_packet *p)
> {
> #ifdef DPDK_NETDEV
> p->mbuf.ol_flags &= ~PKT_RX_RSS_HASH;
> #else
> p->rss_hash_valid = false;
> #endif
> }
>
>

I agree with Darrell, I think it's better to use dp_packet_rss_invalidate().

Also, if we include dp_packet_rss_invalidate() in dp_packet_init__(),
we will have redundant calls to dp_packet_rss_invalidate() in
netdev-{bsd,dummy,linux}.c. Would you mind removing those? There's
another one in netdev-dpdk.c, but that will be required anyway.

Would you mind sending a v2 with the suggested changes?

Thanks!
___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v6 06/12] dpif-netdev: Wait an RCU grace period before freeing ports.

2016-04-01 Thread Daniele Di Proietto


On 01/04/2016 09:52, "Jarno Rajahalme" <ja...@ovn.org> wrote:

>
>> On Mar 30, 2016, at 8:08 PM, Daniele Di Proietto
>><diproiet...@vmware.com> wrote:
>> 
>> 
>> On 30/03/2016 16:01, "Ben Pfaff" <b...@ovn.org> wrote:
>> 
>>> (I'm taking a look at this patch specifically because Daniele asked me;
>>> I'm not planning to review the whole series.)
>>> 
>>> On Mon, Mar 28, 2016 at 12:41:40PM -0700, Daniele Di Proietto wrote:
>>>> The dpif-netdev datapath keeps ports in a cmap which is written only
>>>>by
>>>> the main thread (holding port_mutex), but which is read concurrently
>>>>by
>>>> many threads (most notably the pmd threads).
>>>> 
>>>> When removing ports from the datapath we should postpone the deletion,
>>>> otherwise another thread might access invalid memory while reading the
>>>> cmap.
>>>> 
>>>> This commit splits do_port_del() in do_port_remove() and
>>>> do_port_destroy(): the former removes the port from the cmap, while
>>>>the
>>>> latter reclaims the memory and drops the reference to the underlying
>>>> netdev.
>>> 
>>> s/del_port/port_del/ here:
>> 
>> Thanks, changed
>> 
>>> 
>>>> dpif_netdev_del_port() now uses ovsrcu_synchronize() before calling
>>>> do_port_destroy(), to avoid memory corruption in concurrent readers.
>>> 
>>> ovsrcu_synchronize() requires that nothing in the thread that calls it
>>> is relying on RCU to keep objects around.  That means that no caller of
>>> dpif_port_del()--there are a few of them--can rely on it.  This is
>>> usually a risky assumption, especially because this assumption can
>>> change later.  Is there reason to believe that it isn't important in
>>>all
>>> of these cases?
>> 
>> I agree that's risky, but I think it's the only way to keep the ports
>>RCU
>> protected, because a port needs to be effectively deleted before
>> dpif_netdev_port_del() can return.
>> 
>
>If this is because otherwise a following port_add can fail, as the old
>port is still around, maybe we could make the highest possible level of
>port_add detect the failure and then rcu_synchronize and try again? Would
>that work?
>
>  Jarno

That would work for deleting the port, but there are other reasons we need
to synchronize.  When a netdev has to be reconfigured (in the last patch
of the series) and we remove it from the cmap, we need to synchronize to
make sure that other threads have stopped using it.

I'm trying to add some compile-time RCU checks using clang thread safety
annotations, but for those to be effective we have to introduce
ovsrcu_read_lock() and ovsrcu_read_unlock() on every block that keeps RCU
references and I'm not sure we want to go down that path.

I've also remembered that dpif_netdev_port_add() and
dpif_netdev_port_del() might already quiesce, because they could call
ovs_mutex_cond_wait().  I'll try to post a patch to fix that, if we
believe it's an issue.

If ovsrcu_synchronize() is not an acceptable solution, I guess we should
just use an hmap for ports and have pmdthread-local copies.  This means
that every port_add or port_del (even for non DPDK ports) would need to
stop every pmd thread, but I guess there's no way around it.

Thanks,

Daniele

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v6 06/12] dpif-netdev: Wait an RCU grace period before freeing ports.

2016-03-30 Thread Daniele Di Proietto

On 30/03/2016 16:01, "Ben Pfaff" <b...@ovn.org> wrote:

>(I'm taking a look at this patch specifically because Daniele asked me;
>I'm not planning to review the whole series.)
>
>On Mon, Mar 28, 2016 at 12:41:40PM -0700, Daniele Di Proietto wrote:
>> The dpif-netdev datapath keeps ports in a cmap which is written only by
>> the main thread (holding port_mutex), but which is read concurrently by
>> many threads (most notably the pmd threads).
>> 
>> When removing ports from the datapath we should postpone the deletion,
>> otherwise another thread might access invalid memory while reading the
>> cmap.
>> 
>> This commit splits do_port_del() in do_port_remove() and
>> do_port_destroy(): the former removes the port from the cmap, while the
>> latter reclaims the memory and drops the reference to the underlying
>> netdev.
>
>s/del_port/port_del/ here:

Thanks, changed

>
>> dpif_netdev_del_port() now uses ovsrcu_synchronize() before calling
>> do_port_destroy(), to avoid memory corruption in concurrent readers.
>
>ovsrcu_synchronize() requires that nothing in the thread that calls it
>is relying on RCU to keep objects around.  That means that no caller of
>dpif_port_del()--there are a few of them--can rely on it.  This is
>usually a risky assumption, especially because this assumption can
>change later.  Is there reason to believe that it isn't important in all
>of these cases?

I agree that's risky, but I think it's the only way to keep the ports RCU
protected, because a port needs to be effectively deleted before
dpif_netdev_port_del() can return.


I think it will not be too risky because the code that calls
dpif_netdev_port_del() is high level code that doesn't deal with RCU
protected pointers.  Of course, things might change.

One way to improve the situation would be to use thread-safety annotation
to mark all the functions that might quiesce: I've tried doing that, but
a lot of functions need to get tagged, like ovs_mutex_cond_wait(), which
we already use in the userspace datapath.

An easy way to get rid of this problem would be to avoid RCU for ports and
storing them in an hmap, but I would like to avoid that.

I'll keep thinking about this, maybe we can come up with a better idea.

Thanks,

Daniele

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH] dpif-netdev: Remove PMD latency on seq_mutex

2016-03-29 Thread Daniele Di Proietto


On 29/03/2016 06:44, "Karl Rister" <kris...@redhat.com> wrote:

>On 03/29/2016 08:08 AM, Flavio Leitner wrote:
>> On Tue, Mar 29, 2016 at 02:13:18AM +, Daniele Di Proietto wrote:
>>> Hi Flavio and Karl,
>>>
>>> thanks for the patch! I have a couple of comments:
>>>
>>> Can you point out a configuration where this is the bottleneck?
>>> I'm interested in reproducing this.
>> 
>> Karl, since you did the tests, could you please provide more details?
>
>When performing packet forwarding latency tests, I first noticed system
>and idle time when looking at CPU statistics when I expected the PMD
>threads to be 100% in userspace.  I used the kernel ftrace facility to
>track down what was happening and saw that the PMD thread was being
>context switched out and going idle.  The PMD thread was pinned to CPU
>core thread isolated with isolcpus so there was no competing task that
>could be scheduled to cause the context switch and I would not expect
>the polling thread to ever go idle.  Further analysis with ftrace and gdb
>tracked the cause to seq_mutex blocking when another task held the mutex.
>
>I would estimate that this change removed packet latency spikes of 35-45
>usecs in our test scenario.
>
>The test is forwarding packets through a KVM guest using OVS+DPDK in the
>host and the DPDK testpmd application in the guest.

Thanks for the explanation

>
>Flavio, I thought I remembered you also saying that you saw a throughput
>improvement in a test you were running?
>
>> 
>> 
>>> I think the implementation would look simpler if we could
>>> avoid explicitly taking the mutex in dpif-netdev and instead
>>> having a ovsrcu_try_quiesce(). What do you think?
>> 
>> My concern is that it is freeing one entry from EMC each round
>> and it should quiesce to allow the callbacks to run.  If, for
>> some reason, it fails to quiesce for a long period, then it might
>> backlog a significant number of entries.
>
>My initial approach, which Flavio's code is very similar to, was simply
>trying to provide the simplest change to achieve what I was looking for.
> I could certainly see alternative solutions being more appropriate.
>
>> 
>> 
>>> I think we can avoid the recursive mutex as well if we introduce
>>> some explicit APIs in seq (seq_try_lock, seq_change_protected and
>>> seq_unlock), but I'd like to understand the performance implication
>>> of this commit first.
>
>One other area of the sequence code that I thought was curious was a
>single mutex that covered all sequences.  If updating the API is a
>possibility I would think going to a mutex per sequence might be an
>appropriate change as well.  That said, I don't have data that
>specifically points this out as a problem.

If we find this to be a bottleneck I think we can have a finer-grained
locking.

>
>> 
>> The issue is the latency spike when the PMD thread blocks on the
>> busy mutex.
>> 
>> The goal with recursive locking is to make sure we can sweep
>> the EMC cache and quiesce without blocking.  Fixing seq API
>> would help to not block, but then we have no control to whether
>> we did both tasks in the same round.
>> 
>> fbl
>> 
>> 
>>>
>>> On 23/03/2016 20:54, "dev on behalf of Flavio Leitner"
>>> <dev-boun...@openvswitch.org on behalf of f...@redhat.com> wrote:
>>>
>>>> The PMD thread needs to keep processing RX queues in order
>>>> to achieve maximum throughput.  However, it also needs to run
>>>> other tasks after some time processing the RX queues which
>>>> a mutex can block the PMD thread.  That causes latency
>>>> spikes and affects the throughput.
>>>>
>>>> Convert to recursive mutex so that PMD thread can test first
>>>> and if it gets the lock, continue as before, otherwise try
>>>> again another time.  There is an additional logic to make
>>>> sure the PMD thread will try harder as the attempt to get
>>>> the mutex continues to fail.
>>>>
>>>> Co-authored-by: Karl Rister <kris...@redhat.com>
>>>> Signed-off-by: Flavio Leitner <f...@redhat.com>
>>>
>>> Oh, we're going to need a signoff from Karl as well :-)
>
>Signed-off-by: Karl Rister <kris...@redhat.com>
>
>Is this good enough?

Absolutely, thanks!

>
>>>
>>> Thanks,
>>>
>>> Daniele
>>>
>>>> ---
>>>> include/openvswitch/thread.h |  3 +++
>>>> lib/dpif-netdev.c| 33 +

Re: [ovs-dev] [PATCH] dpif-netdev: Remove PMD latency on seq_mutex

2016-03-29 Thread Daniele Di Proietto


On 29/03/2016 06:08, "Flavio Leitner" <f...@redhat.com> wrote:

>On Tue, Mar 29, 2016 at 02:13:18AM +, Daniele Di Proietto wrote:
>> Hi Flavio and Karl,
>> 
>> thanks for the patch! I have a couple of comments:
>> 
>> Can you point out a configuration where this is the bottleneck?
>> I'm interested in reproducing this.
>
>Karl, since you did the tests, could you please provide more details?
>
>
>> I think the implementation would look simpler if we could
>> avoid explicitly taking the mutex in dpif-netdev and instead
>> having a ovsrcu_try_quiesce(). What do you think?
>
>My concern is that it is freeing one entry from EMC each round
>and it should quiesce to allow the callbacks to run.  If, for
>some reason, it fails to quiesce for a long period, then it might
>backlog a significant number of entries.
>
>
>> I think we can avoid the recursive mutex as well if we introduce
>> some explicit APIs in seq (seq_try_lock, seq_change_protected and
>> seq_unlock), but I'd like to understand the performance implication
>> of this commit first.
>
>The issue is the latency spike when the PMD thread blocks on the
>busy mutex.
>
>The goal with recursive locking is to make sure we can sweep
>the EMC cache and quiesce without blocking.  Fixing seq API
>would help to not block, but then we have no control to whether
>we did both tasks in the same round.
>
>fbl

If I understand your concerns correctly, I think we can have something
like:

if (ovsrcu_try_quiesce()) {
...
emc_cache_slow_sweep();
...
}

Sure, the swept flows will need to wait another round to actually get
freed, but I think this is ok.

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v3] netdev-dpdk: vhost: Fix txq enabling in the absence of notifications.

2016-03-29 Thread Daniele Di Proietto
Thanks!

I applied this to master and branch-2.5

On 28/03/2016 23:20, "Ilya Maximets"  wrote:

>According to QEMU documentation (docs/specs/vhost-user.txt) one queue
>should be enabled initially. More queues are enabled dynamically, by
>sending message VHOST_USER_SET_VRING_ENABLE.
>
>Currently all queues in OVS disabled by default. This breaks above
>specification. So, queue #0 should be enabled by default to support
>QEMU versions less than 2.5 and fix probable issues if QEMU will not
>send VHOST_USER_SET_VRING_ENABLE for queue #0 according to documentation.
>Also this will fix currently broken vhost-cuse support in OVS.
>
>Fixes: 585a5beaa2a4 ("netdev-dpdk: vhost-user: Fix sending packets to
>  queues not enabled by guest.")
>Reported-by: Mauricio Vasquez B
>
>Signed-off-by: Ilya Maximets 
>---
>
>version 3:
>   * Fixed qid checking in __netdev_dpdk_vhost_send()
>
>version 2:
>   * Fixed initialization in netdev_dpdk_alloc_txq().
>   * Clearing moved to separate function.
>
> lib/netdev-dpdk.c | 28 
> 1 file changed, 24 insertions(+), 4 deletions(-)
>
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 7c4cd07..8eea788 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -103,6 +103,9 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF /
>ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
> #define NIC_PORT_TX_Q_SIZE 2048  /* Size of Physical NIC TX Queue, Max
>(n+32<=4096)*/
> 
> #define OVS_VHOST_MAX_QUEUE_NUM 1024  /* Maximum number of vHost TX
>queues. */
>+#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
>+#define OVS_VHOST_QUEUE_DISABLED(-2) /* Queue was disabled by guest
>and not
>+  * yet mapped to another queue.
>*/
> 
> static char *cuse_dev_name = NULL;/* Character device cuse_dev_name.
>*/
> static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets
>*/
>@@ -671,7 +674,7 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev,
>unsigned int n_txqs)
> }
> 
> /* Initialize map for vhost devices. */
>-netdev->tx_q[i].map = -1;
>+netdev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
> rte_spinlock_init(&netdev->tx_q[i].tx_lock);
> }
> }
>@@ -1265,7 +1268,7 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int
>qid,
> 
> qid = vhost_dev->tx_q[qid % vhost_dev->real_n_txq].map;
> 
>-if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid == -1)) {
>+if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) {
> rte_spinlock_lock(&vhost_dev->stats_lock);
> vhost_dev->stats.tx_dropped+= cnt;
> rte_spinlock_unlock(&vhost_dev->stats_lock);
>@@ -2019,7 +2022,7 @@ netdev_dpdk_remap_txqs(struct netdev_dpdk *netdev)
> }
> 
> if (n_enabled == 0 && total_txqs != 0) {
>-enabled_queues[0] = -1;
>+enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
> n_enabled = 1;
> }
> 
>@@ -2056,6 +2059,10 @@ netdev_dpdk_vhost_set_queues(struct netdev_dpdk
>*netdev, struct virtio_net *dev)
> netdev->real_n_rxq = qp_num;
> netdev->real_n_txq = qp_num;
> netdev->txq_needs_locking = true;
>+/* Enable TX queue 0 by default if it wasn't disabled. */
>+if (netdev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
>+netdev->tx_q[0].map = 0;
>+}
> 
> netdev_dpdk_remap_txqs(netdev);
> 
>@@ -2104,6 +2111,18 @@ new_device(struct virtio_net *dev)
> return 0;
> }
> 
>+/* Clears mapping for all available queues of vhost interface. */
>+static void
>+netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
>+OVS_REQUIRES(dev->mutex)
>+{
>+int i;
>+
>+for (i = 0; i < dev->real_n_txq; i++) {
>+dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
>+}
>+}
>+
> /*
>  * Remove a virtio-net device from the specific vhost port.  Use
>dev->remove
>  * flag to stop any more packets from being sent or received to/from a
>VM and
>@@ -2123,6 +2142,7 @@ destroy_device(volatile struct virtio_net *dev)
> ovs_mutex_lock(&vhost_dev->mutex);
> dev->flags &= ~VIRTIO_DEV_RUNNING;
> ovsrcu_set(&vhost_dev->virtio_dev, NULL);
>+netdev_dpdk_txq_map_clear(vhost_dev);
> exists = true;
> ovs_mutex_unlock(&vhost_dev->mutex);
> break;
>@@ -2169,7 +2189,7 @@ vring_state_changed(struct virtio_net *dev,
>uint16_t queue_id, int enable)
> if (enable) {
> vhost_dev->tx_q[qid].map = qid;
> } else {
>-vhost_dev->tx_q[qid].map = -1;
>+vhost_dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> }
> netdev_dpdk_remap_txqs(vhost_dev);
> exists = true;
>-- 
>2.5.0
>

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH] dpif-netdev: Remove PMD latency on seq_mutex

2016-03-28 Thread Daniele Di Proietto
Hi Flavio and Karl,

thanks for the patch! I have a couple of comments:

Can you point out a configuration where this is the bottleneck?
I'm interested in reproducing this.

I think the implementation would look simpler if we could
avoid explicitly taking the mutex in dpif-netdev and instead
having a ovsrcu_try_quiesce(). What do you think?

I think we can avoid the recursive mutex as well if we introduce
some explicit APIs in seq (seq_try_lock, seq_change_protected and
seq_unlock), but I'd like to understand the performance implication
of this commit first.

On 23/03/2016 20:54, "dev on behalf of Flavio Leitner"
 wrote:

>The PMD thread needs to keep processing RX queues in order
>to achieve maximum throughput.  However, it also needs to run
>other tasks after some time processing the RX queues which
>a mutex can block the PMD thread.  That causes latency
>spikes and affects the throughput.
>
>Convert to recursive mutex so that PMD thread can test first
>and if it gets the lock, continue as before, otherwise try
>again another time.  There is an additional logic to make
>sure the PMD thread will try harder as the attempt to get
>the mutex continues to fail.
>
>Co-authored-by: Karl Rister 
>Signed-off-by: Flavio Leitner 

Oh, we're going to need a signoff from Karl as well :-)

Thanks,

Daniele

>---
> include/openvswitch/thread.h |  3 +++
> lib/dpif-netdev.c| 33 ++---
> lib/seq.c| 15 ++-
> lib/seq.h|  3 +++
> 4 files changed, 42 insertions(+), 12 deletions(-)
>
>diff --git a/include/openvswitch/thread.h b/include/openvswitch/thread.h
>index af6f2bb..6d20720 100644
>--- a/include/openvswitch/thread.h
>+++ b/include/openvswitch/thread.h
>@@ -44,6 +44,9 @@ struct OVS_LOCKABLE ovs_mutex {
> #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
> #endif
> 
>+#define OVS_RECURSIVE_MUTEX_INITIALIZER \
>+   { PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP, "" }
>+
> /* ovs_mutex functions analogous to pthread_mutex_*() functions.
>  *
>  * Most of these functions abort the process with an error message on any
>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>index 0f2385a..a10b2d1 100644
>--- a/lib/dpif-netdev.c
>+++ b/lib/dpif-netdev.c
>@@ -2668,12 +2668,15 @@ static void *
> pmd_thread_main(void *f_)
> {
> struct dp_netdev_pmd_thread *pmd = f_;
>-unsigned int lc = 0;
>+unsigned int lc_max = 1024;
>+unsigned int lc_start;
>+unsigned int lc;
> struct rxq_poll *poll_list;
> unsigned int port_seq = PMD_INITIAL_SEQ;
> int poll_cnt;
> int i;
> 
>+lc_start = 0;
> poll_cnt = 0;
> poll_list = NULL;
> 
>@@ -2698,24 +2701,32 @@ reload:
>  * reloading the updated configuration. */
> dp_netdev_pmd_reload_done(pmd);
> 
>+lc = lc_start;
> for (;;) {
> for (i = 0; i < poll_cnt; i++) {
> dp_netdev_process_rxq_port(pmd, poll_list[i].port,
>poll_list[i].rx);
> }
> 
>-if (lc++ > 1024) {
>-unsigned int seq;
>+if (lc++ > lc_max) {
>+if (!seq_pmd_trylock()) {
>+unsigned int seq;
>+lc_start = 0;
>+lc = 0;
> 
>-lc = 0;
>+emc_cache_slow_sweep(>flow_cache);
>+coverage_try_clear();
>+ovsrcu_quiesce();
> 
>-emc_cache_slow_sweep(>flow_cache);
>-coverage_try_clear();
>-ovsrcu_quiesce();
>+seq_pmd_unlock();
> 
>-atomic_read_relaxed(>change_seq, );
>-if (seq != port_seq) {
>-port_seq = seq;
>-break;
>+atomic_read_relaxed(>change_seq, );
>+if (seq != port_seq) {
>+port_seq = seq;
>+break;
>+}
>+} else {
>+lc_start += (lc_start + lc_max)/2;
>+lc = lc_start;
> }
> }
> }
>diff --git a/lib/seq.c b/lib/seq.c
>index 9c3257c..198b2ce 100644
>--- a/lib/seq.c
>+++ b/lib/seq.c
>@@ -55,7 +55,7 @@ struct seq_thread {
> bool waiting OVS_GUARDED;/* True if latch_wait() already
>called. */
> };
> 
>-static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
>+static struct ovs_mutex seq_mutex = OVS_RECURSIVE_MUTEX_INITIALIZER;
> 
> static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
> 
>@@ -68,6 +68,19 @@ static void seq_thread_woke(struct seq_thread *)
>OVS_REQUIRES(seq_mutex);
> static void seq_waiter_destroy(struct seq_waiter *)
>OVS_REQUIRES(seq_mutex);
> static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
> 
>+
>+int seq_pmd_trylock(void)
>+ OVS_TRY_LOCK(0, seq_mutex)
>+{
>+  return ovs_mutex_trylock(_mutex);
>+}
>+
>+void seq_pmd_unlock(void)
>+OVS_RELEASES(seq_mutex)
>+{
>+  ovs_mutex_unlock(_mutex);
>+}
>+
> /* Creates and returns a 

Re: [ovs-dev] [PATCH v2] netdev-dpdk: vhost: Fix txq enabling in the absence of notifications.

2016-03-28 Thread Daniele Di Proietto
Thanks for the patch, Ilya

In __netdev_dpdk_vhost_send() we have

qid = vhost_dev->tx_q[qid % vhost_dev->real_n_txq].map;

if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid == -1)) {


Should we change -1 to OVS_VHOST_QUEUE_MAP_UNKNOWN and handle also
OVS_VHOST_QUEUE_DISABLED?

On 24/03/2016 23:50, "Ilya Maximets"  wrote:

>According to QEMU documentation (docs/specs/vhost-user.txt) one queue
>should be enabled initially. More queues are enabled dynamically, by
>sending message VHOST_USER_SET_VRING_ENABLE.
>
>Currently all queues in OVS are disabled by default. This breaks the above
>specification. So, queue #0 should be enabled by default to support
>QEMU versions less than 2.5 and fix probable issues if QEMU will not
>send VHOST_USER_SET_VRING_ENABLE for queue #0 according to documentation.
>Also this will fix currently broken vhost-cuse support in OVS.
>
>Fixes: 585a5beaa2a4 ("netdev-dpdk: vhost-user: Fix sending packets to
>  queues not enabled by guest.")
>Reported-by: Mauricio Vasquez B
>
>Signed-off-by: Ilya Maximets 
>---
>
>version 2:
>   * Fixed initialization in netdev_dpdk_alloc_txq().
>   * Clearing moved to separate function.
>
> lib/netdev-dpdk.c | 26 +++---
> 1 file changed, 23 insertions(+), 3 deletions(-)
>
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 7c4cd07..9b541cb 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -103,6 +103,9 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF /
>ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
> #define NIC_PORT_TX_Q_SIZE 2048  /* Size of Physical NIC TX Queue, Max
>(n+32<=4096)*/
> 
> #define OVS_VHOST_MAX_QUEUE_NUM 1024  /* Maximum number of vHost TX
>queues. */
>+#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
>+#define OVS_VHOST_QUEUE_DISABLED(-2) /* Queue was disabled by guest
>and not
>+  * yet mapped to another queue.
>*/
> 
> static char *cuse_dev_name = NULL;/* Character device cuse_dev_name.
>*/
> static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets
>*/
>@@ -671,7 +674,7 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev,
>unsigned int n_txqs)
> }
> 
> /* Initialize map for vhost devices. */
>-netdev->tx_q[i].map = -1;
>+netdev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
> rte_spinlock_init(>tx_q[i].tx_lock);
> }
> }
>@@ -2019,7 +2022,7 @@ netdev_dpdk_remap_txqs(struct netdev_dpdk *netdev)
> }
> 
> if (n_enabled == 0 && total_txqs != 0) {
>-enabled_queues[0] = -1;
>+enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
> n_enabled = 1;
> }
> 
>@@ -2056,6 +2059,10 @@ netdev_dpdk_vhost_set_queues(struct netdev_dpdk
>*netdev, struct virtio_net *dev)
> netdev->real_n_rxq = qp_num;
> netdev->real_n_txq = qp_num;
> netdev->txq_needs_locking = true;
>+/* Enable TX queue 0 by default if it wasn't disabled. */
>+if (netdev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
>+netdev->tx_q[0].map = 0;
>+}
> 
> netdev_dpdk_remap_txqs(netdev);
> 
>@@ -2104,6 +2111,18 @@ new_device(struct virtio_net *dev)
> return 0;
> }
> 
>+/* Clears mapping for all available queues of vhost interface. */
>+static void
>+netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
>+OVS_REQUIRES(dev->mutex)
>+{
>+int i;
>+
>+for (i = 0; i < dev->real_n_txq; i++) {
>+dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
>+}
>+}
>+
> /*
>  * Remove a virtio-net device from the specific vhost port.  Use
>dev->remove
>  * flag to stop any more packets from being sent or received to/from a
>VM and
>@@ -2123,6 +2142,7 @@ destroy_device(volatile struct virtio_net *dev)
> ovs_mutex_lock(_dev->mutex);
> dev->flags &= ~VIRTIO_DEV_RUNNING;
> ovsrcu_set(_dev->virtio_dev, NULL);
>+netdev_dpdk_txq_map_clear(vhost_dev);
> exists = true;
> ovs_mutex_unlock(_dev->mutex);
> break;
>@@ -2169,7 +2189,7 @@ vring_state_changed(struct virtio_net *dev,
>uint16_t queue_id, int enable)
> if (enable) {
> vhost_dev->tx_q[qid].map = qid;
> } else {
>-vhost_dev->tx_q[qid].map = -1;
>+vhost_dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> }
> netdev_dpdk_remap_txqs(vhost_dev);
> exists = true;
>-- 
>2.5.0
>

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v10 5/6] netdev-dpdk: Check dpdk-extra when reading db

2016-03-28 Thread Daniele Di Proietto


On 09/03/2016 09:38, "Aaron Conole"  wrote:

>A previous patch introduced the ability to pass arbitrary EAL command
>line options via the dpdk_extras database entry. This commit enhances
>that by warning the user when such a configuration is detected and
>preferring the value in the database.
>
>Suggested-by: Sean K Mooney 
>Signed-off-by: Aaron Conole 
>Tested-by: Sean K Mooney 
>Tested-by: Kevin Traynor 
>Acked-by: Panu Matilainen 
>Acked-by: Flavio Leitner 
>---
>v9:
>* Added as suggested by Sean K Mooney
>
>v10:
>* no change
>
> lib/netdev-dpdk.c | 66
>+--
> 1 file changed, 55 insertions(+), 11 deletions(-)
>
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 4d3720f..2ca519d 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -2756,6 +2756,17 @@ dpdk_option_extend(char ***argv, int argc, const
>char *option,
> (*argv)[argc+1] = xstrdup(value);
> }
> 
>+static char **
>+move_argv(char ***argv, size_t cur_size, char **src_argv, size_t
>src_argc)
>+{
>+char **newargv = grow_argv(argv, cur_size, src_argc);
>+while(src_argc--) {

Space between while and (

>+newargv[cur_size+src_argc] = src_argv[src_argc];
>+src_argv[src_argc] = 0;
>+}
>+return newargv;
>+}
>+
> static int
> extra_dpdk_args(const char *ovs_cfg, char ***argv, int argc)
> {
>@@ -2773,9 +2784,21 @@ extra_dpdk_args(const char *ovs_cfg, char ***argv,
>int argc)
> return ret;
> }
> 
>+static bool
>+argv_contains(char **argv_haystack, const size_t argc_haystack,
>+  const char *needle)
>+{
>+for(size_t i = 0; i < argc_haystack; ++i) {

Space between for and (

>+if (!strcmp(argv_haystack[i], needle))
>+return true;
>+}
>+return false;
>+}
>+
> static int
> construct_dpdk_options(const struct ovsrec_open_vswitch *ovs_cfg,
>-   char ***argv, const int initial_size)
>+   char ***argv, const int initial_size,
>+   char **extra_args, const size_t extra_argc)
> {
> struct dpdk_options_map {
> const char *ovs_configuration;
>@@ -2797,8 +2820,13 @@ construct_dpdk_options(const struct
>ovsrec_open_vswitch *ovs_cfg,
> lookup = opts[i].default_value;
> 
> if(lookup) {

Space between if and (

>-dpdk_option_extend(argv, ret, opts[i].dpdk_option, lookup);
>-ret += 2;
>+if (!argv_contains(extra_args, extra_argc,
>opts[i].dpdk_option)) {
>+dpdk_option_extend(argv, ret, opts[i].dpdk_option,
>lookup);
>+ret += 2;
>+} else {
>+VLOG_WARN("Ignoring database defined option '%s' due to "
>+  "dpdk_extras config", opts[i].dpdk_option);
>+}
> }
> }
> 
>@@ -2807,7 +2835,8 @@ construct_dpdk_options(const struct
>ovsrec_open_vswitch *ovs_cfg,
> 
> static int
> construct_dpdk_mutex_options(const struct ovsrec_open_vswitch *ovs_cfg,
>- char ***argv, const int initial_size)
>+ char ***argv, const int initial_size,
>+ char **extra_args, const size_t extra_argc)
> {
> struct dpdk_exclusive_options_map {
> const char *category;
>@@ -2855,9 +2884,15 @@ construct_dpdk_mutex_options(const struct
>ovsrec_open_vswitch *ovs_cfg,
> ovs_abort(0, "Unable to cope with DPDK settings.");
> }
> 
>-dpdk_option_extend(argv, ret, popt->eal_dpdk_options[found_pos],
>-   found_value);
>-ret += 2;
>+if (!argv_contains(extra_args, extra_argc,
>+   popt->eal_dpdk_options[found_pos])) {
>+dpdk_option_extend(argv, ret,
>popt->eal_dpdk_options[found_pos],
>+   found_value);
>+ret += 2;
>+} else {
>+VLOG_WARN("Ignoring database defined option '%s' due to"
>+  " dpdk_extras config",
>popt->eal_dpdk_options[found_pos]);
>+}
> }
> 
> return ret;
>@@ -2868,14 +2903,23 @@ get_dpdk_args(const struct ovsrec_open_vswitch
>*ovs_cfg, char ***argv,
>   int argc)
> {
> const char *extra_configuration;
>-int i = construct_dpdk_options(ovs_cfg, argv, argc);
>-i = construct_dpdk_mutex_options(ovs_cfg, argv, i);
>+char **extra_args = NULL;
>+int i;
>+size_t extra_argc = 0;
> 
> extra_configuration = smap_get(_cfg->other_config, "dpdk-extra");
> if (extra_configuration) {
>-i = extra_dpdk_args(extra_configuration, argv, i);
>+extra_argc = extra_dpdk_args(extra_configuration, _args,
>0);
> }
>-return i;
>+
>+i = construct_dpdk_options(ovs_cfg, argv, argc, extra_args,
>extra_argc);
>+i = 

Re: [ovs-dev] [PATCH v10 2/6] netdev-dpdk: Convert initialization from cmdline to db

2016-03-28 Thread Daniele Di Proietto

I still have some comments:

dpdk-mem-channels: This is not required by DPDK anymore, so I still
think that's not necessary and could be removed.  If someone want

I think we shouldn't abort if we fail something during the initialization.
I know that rte_eal_init() can still abort, but I want to avoid it as
much as possible in OVS (It's still ok to abort for memory
allocation failure, as we do in the rest of OVS).  I've commented inline
when ovs_abort() is used.

Since we only use the "other_config" member of "struct
ovsrec_open_vswitch",
I would avoid passing the whole structure to dpdk_init().  We can avoid
including "vswitch-idl.h".

On 09/03/2016 09:38, "Aaron Conole"  wrote:

>Existing DPDK integration is provided by use of command line options which
>must be split out and passed to librte in a special manner. However, this
>forces any configuration to be passed by way of a special DPDK flag, and
>interferes with ovs+dpdk packaging solutions.
>
>This commit delays dpdk initialization until after the OVS database
>connection is established, at which point ovs initializes librte. It
>pulls all of the config data from the OVS database, and assembles a
>new argv/argc pair to be passed along.
>
>Signed-off-by: Aaron Conole 
>Tested-by: Sean K Mooney 
>Tested-by: RobertX Wojciechowicz 
>Tested-by: Kevin Traynor 
>Acked-by: Panu Matilainen 
>Acked-by: Kevin Traynor 
>Acked-by: Flavio Leitner 
>---
>v2:
>* Removed trailing whitespace
>* Followed for() loop brace coding style
>* Automatically enable DPDK when adding a DPDK enabled port
>* Fixed an issue on startup when DPDK enabled ports are present
>* Updated the documentation (including vswitch.xml) and documented all
>  new parameters
>* Dropped the premature initialization test
>
>v3:
>* Improved description language in INSTALL.DPDK.md
>* Fixed the ovs-vsctl examples for DPDK
>* Returned to the global dpdk-init (bullet 3 from v2)
>* Fixed a build error when compiling without dpdk support enabled
>* converted to xstrdup, for consistency after rebasing
>
>v4:
>* No change
>
>v5:
>* Adjust the ovs-dev script to account for the new dpdk configuration
>* Update the ovs-vswitchd.8.in pointing to INSTALL.DPDK.md
>
>v6:
>* Remove excess whitespace addition
>* Correct INSTALL.DPDK.md regarding when DPDK is initialized
>* Used incorrect variable in the non-dpdk case for testing against
>  dpdk
>
>v7:
>* Account for mutually exclusive options;
>
>v8:
>* ``make check`` testing revealed a number of flaws in the initialization
>  which resulted in memory corruption
>* Fixed the ovs-vswitchd startup during testing
>
>v9:
>* Re-arrange the added headers in netdev-dpdk.c to try and be alphabetical
>* Convert '-c' and '-n' options to be default non-inserted
>
>v10:
>* Documentation adjustment in vswitch.xml explicitly stating these values
>  are not runtime configurable.
>* Wrapped vhost_cuse_dev in #ifdef
>
> FAQ.md |   6 +-
> INSTALL.DPDK.md|  81 +---
> lib/netdev-dpdk.c  | 308
>+
> lib/netdev-dpdk.h  |  22 ++--
> tests/ofproto-macros.at|   3 +-
> utilities/ovs-dev.py   |  11 +-
> vswitchd/bridge.c  |   3 +
> vswitchd/ovs-vswitchd.8.in |   5 +-
> vswitchd/ovs-vswitchd.c|  25 +---
> vswitchd/vswitch.xml   | 131 ++-
> 10 files changed, 454 insertions(+), 141 deletions(-)
>
>diff --git a/FAQ.md b/FAQ.md
>index 8bd7ab9..018e6ae 100644
>--- a/FAQ.md
>+++ b/FAQ.md
>@@ -431,9 +431,9 @@ A: Yes.  How you configure it depends on what you
>mean by "promiscuous
> 
> A: Firstly, you must have a DPDK-enabled version of Open vSwitch.
> 
>-   If your version is DPDK-enabled it will support the --dpdk
>-   argument on the command line and will display lines with
>-   "EAL:..." during startup when --dpdk is supplied.
>+   If your version is DPDK-enabled it will support the
>other_config:dpdk-init
>+   configuration in the database and will display lines with
>+   "EAL:..." during startup when other_config:dpdk-init is set to 'true'.
> 
>Secondly, when adding a DPDK port, unlike a system port, the
>type for the interface must be specified. For example;
>diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
>index 1fc1b66..613764f 100644
>--- a/INSTALL.DPDK.md
>+++ b/INSTALL.DPDK.md
>@@ -143,22 +143,64 @@ Using the DPDK with ovs-vswitchd:
> 
> 5. Start vswitchd:
> 
>-   DPDK configuration arguments can be passed to vswitchd via `--dpdk`
>-   argument. This needs to be first argument passed to vswitchd process.
>-   dpdk arg -c is ignored by ovs-dpdk, but it is a required parameter
>-   for dpdk initialization.
>+   DPDK configuration arguments can be passed to vswitchd via
>Open_vSwitch
>+   other_config database. The recognized configuration options are


Re: [ovs-dev] [PATCH v10 1/6] netdev-dpdk: Restore thread affinity after DPDK init

2016-03-28 Thread Daniele Di Proietto
I'm ok with the idea and the implementation and I'm willing to apply
this patch despite the issue depicted below.

I just want to mention that with just this patch applied I experience
a lower throughput for the single flow phy test on my system (~14 Mpps
instead of 14.88 Mpps).

I thought that this was related to some weird scheduling condition, but
even booting with isolcpus didn't help.

After some time I found that depending on the scheduling of the
revalidator threads, my CPU managed to reach different clock frequencies
(~2.8 GHz when the throughput was high, ~2.4 GHz when the throughput was
low).

I'm not an expert in this area, but I've been told that this may happen
because the CPU decides not to use TurboBoost.

We may want to consider this in the future, when we reconsider how
to pin/spawn non pmd threads.

Thanks,

Daniele

On 09/03/2016 09:38, "Aaron Conole"  wrote:

>When the DPDK init function is called, it changes the executing thread's
>CPU affinity to a single core specified in -c. This will result in the
>userspace bridge configuration thread being rebound, even if that is not
>the intent.
>
>This change fixes that behavior by rebinding to the original thread
>affinity after calling dpdk_init().
>
>Signed-off-by: Kevin Traynor 
>Signed-off-by: Aaron Conole 
>Tested-by: RobertX Wojciechowicz 
>Tested-by: Sean K Mooney 
>Acked-by: Panu Matilainen 
>Acked-by: Flavio Leitner 
>---
>v2:
>* Removed trailing whitespace
>
>v3->v10:
>* No change
>
> lib/netdev-dpdk.c | 18 ++
> 1 file changed, 18 insertions(+)
>
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 0233b3c..d44cf46 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -2739,6 +2739,9 @@ dpdk_init(int argc, char **argv)
> int result;
> int base = 0;
> char *pragram_name = argv[0];
>+int err;
>+int isset;
>+cpu_set_t cpuset;
> 
> if (argc < 2 || strcmp(argv[1], "--dpdk"))
> return 0;
>@@ -2780,6 +2783,14 @@ dpdk_init(int argc, char **argv)
> base = 2;
> }
> 
>+/* Get the main thread affinity */
>+CPU_ZERO();
>+err = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t),
>);
>+if (err) {
>+VLOG_ERR("Thread getaffinity error %d.", err);
>+return err;
>+}
>+
> /* Keep the program name argument as this is needed for call to
>  * rte_eal_init()
>  */
>@@ -2791,6 +2802,13 @@ dpdk_init(int argc, char **argv)
> ovs_abort(result, "Cannot init EAL");
> }
> 
>+/* Set the main thread affinity back to pre rte_eal_init() value */
>+err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t),
>);
>+if (err) {
>+VLOG_ERR("Thread setaffinity error %d", err);
>+return err;
>+}
>+
> rte_memzone_dump(stdout);
> rte_eal_init_ret = 0;
> 
>-- 
>2.5.0

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


Re: [ovs-dev] [PATCH v10 4/6] netdev-dpdk: Allow arbitrary eal arguments

2016-03-28 Thread Daniele Di Proietto


On 09/03/2016 10:38, "Aaron Conole"  wrote:

>A previous change moved some commonly used arguments from commandline to
>the database, and with it the ability to pass arbitrary arguments to
>EAL. This change allows arbitrary eal arguments to be provided
>via a new db entry 'other_config:dpdk-extra' which will tokenize the
>string and add it to the argument list. The only argument which will not
>be supported with this change is '--no-huge', which appears to break the
>system in other ways.
>
>Signed-off-by: Aaron Conole 
>Tested-by: Sean K Mooney 
>Tested-by: RobertX Wojciechowicz 
>Tested-by: Kevin Traynor 
>Acked-by: Panu Matilainen 
>Acked-by: Kevin Traynor 
>Acked-by: Flavio Leitner 
>---
>v4:
>* Added by suggestion of Panu, making socket-mem non-default
>
>v5:
>* Keep the socket-mem as default parameter, and mention that we
>  do not support --no-huge
>* Update ovs-dev.py with the new mechanism for passing arbitrary dpdk
>  options
>
>v6->v9:
>* No change
>
>v10:
>* INSTALL.DPDK.md - removed the note since a future commit in the series
>makes
>  that documentation invalid (and it seems silly to add it here, only to
>remove
>  it in the next commit)
>
> INSTALL.DPDK.md  |  5 +
> lib/netdev-dpdk.c| 49
>+++--
> utilities/ovs-dev.py |  6 --
> vswitchd/vswitch.xml | 11 +++
> 4 files changed, 55 insertions(+), 16 deletions(-)
>
>diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
>index 613764f..3b3d3a0 100644
>--- a/INSTALL.DPDK.md
>+++ b/INSTALL.DPDK.md
>@@ -178,6 +178,11 @@ Using the DPDK with ovs-vswitchd:
>* dpdk-hugepage-dir
>Directory where hugetlbfs is mounted
> 
>+   * dpdk-extra
>+   Extra arguments to provide to DPDK EAL, as previously specified on the
>+   command line. Do not pass '--no-huge' to the system in this way.
>Support
>+   for running the system without hugepages is nonexistent.
>+
>* cuse-dev-name
>Option to set the vhost_cuse character device name.
> 
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 289f916..4d3720f 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -2739,6 +2739,9 @@ static char **
> grow_argv(char ***argv, size_t cur_siz, size_t grow_by)
> {
> char **new_argv = realloc(*argv, sizeof(char *) * (cur_siz +
>grow_by));
>+if (!new_argv) {
>+ovs_abort(0, "grow_argv() failed to allocate memory.");
>+}

No need to check if we use xrealloc()

> return new_argv;
> }
> 
>@@ -2748,16 +2751,29 @@ dpdk_option_extend(char ***argv, int argc, const
>char *option,
> {
> char **newargv = grow_argv(argv, argc, 2);
> 
>-if (!newargv) {
>-ovs_abort(0, "grow_argv() failed to allocate memory.");
>-}
>-
> *argv = newargv;
> (*argv)[argc]   = xstrdup(option);
> (*argv)[argc+1] = xstrdup(value);
> }
> 
> static int
>+extra_dpdk_args(const char *ovs_cfg, char ***argv, int argc)

Would you mind using another name for "ovs_cfg", please?

It's used elsewhere with another meaning

>+{
>+int ret = argc;
>+char *release_tok = xstrdup(ovs_cfg);
>+char *tok = release_tok, *endptr = NULL;
>+
>+for(tok = strtok_r(release_tok, " ", ); tok != NULL;

Space between for and (

>+tok = strtok_r(NULL, " ", )) {
>+char **newarg = grow_argv(argv, ret, 1);
>+*argv = newarg;
>+(*argv)[ret++] = xstrdup(tok);

I'd use "newarg" instead of "(*argv)"

>+}
>+free(release_tok);
>+return ret;
>+}
>+
>+static int
> construct_dpdk_options(const struct ovsrec_open_vswitch *ovs_cfg,
>char ***argv, const int initial_size)
> {
>@@ -2851,8 +2867,14 @@ static int
> get_dpdk_args(const struct ovsrec_open_vswitch *ovs_cfg, char ***argv,
>   int argc)
> {
>+const char *extra_configuration;
> int i = construct_dpdk_options(ovs_cfg, argv, argc);
> i = construct_dpdk_mutex_options(ovs_cfg, argv, i);
>+
>+extra_configuration = smap_get(_cfg->other_config, "dpdk-extra");
>+if (extra_configuration) {
>+i = extra_dpdk_args(extra_configuration, argv, i);
>+}
> return i;
> }
> 
>@@ -2911,17 +2933,15 @@ __dpdk_init(const struct ovsrec_open_vswitch
>*ovs_cfg)
> }
> 
> argv = grow_argv(, argc, argc+1);
>-if (!argv) {
>-ovs_abort(0, "Unable to allocate an initial argv.");
>-}
> argv[argc++] = xstrdup("ovs"); /* TODO use prctl to get process name
>*/
> argc_tmp = get_dpdk_args(ovs_cfg, , argc);
> 
> while(argc_tmp != argc) {
>-if (!strcmp("-c", argv[argc++])) {
>+if (!strcmp("-c", argv[argc]) || !strcmp("-l", argv[argc])) {
> auto_determine = false;
> break;
> }
>+argc++;
> }
> argc = argc_tmp;
> 
>@@ -2936,9 +2956,6 @@ __dpdk_init(const struct ovsrec_open_vswitch
>*ovs_cfg)
>   

Re: [ovs-dev] [PATCH v10 3/6] netdev-dpdk: Autofill lcore coremask if absent

2016-03-28 Thread Daniele Di Proietto


On 09/03/2016 09:38, "Aaron Conole"  wrote:

>The user has control over the DPDK internal lcore coremask, but this
>parameter can be autofilled with a bit more intelligence. If the user
>does not fill this parameter in, we use the lowest set bit in the
>current task CPU affinity. Otherwise, we will reassign the current
>thread to the specified lcore mask, in addition to the dpdk lcore
>threads.
>
>Signed-off-by: Aaron Conole 
>Tested-by: Sean K Mooney 
>Tested-by: RobertX Wojciechowicz 
>Tested-by: Kevin Traynor 
>Acked-by: Panu Matilainen 
>Acked-by: Kevin Traynor 
>Acked-by: Flavio Leitner 
>---
>v2:
>* Fix a conditional branch coding standard issue
>* When lcore coremask is set, do not reset the affinities as
>  suggested by Kevin Traynor
>
>v3:
>* Fixed grow_argv calls
>* Fixed an error in argc assignment after 'break;' introduced
>  in v2
>* Switched to using xstrdup
>
>v4->v7:
>* No change
>
>v8:
>* Assign the lcore only when resetting the affinity.
>
>v9,v10:
>* No change
>
> lib/netdev-dpdk.c | 62
>++-
> 1 file changed, 48 insertions(+), 14 deletions(-)
>
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 7e8e72e..289f916 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -68,6 +68,7 @@ static struct vlog_rate_limit rl =
>VLOG_RATE_LIMIT_INIT(5, 20);
> #define OVS_VPORT_DPDK "ovs_dpdk"
> 
> #define MAX_DPDK_EXCL_OPTS 10
>+#define MAX_BUFSIZ 256

I would remove this, please see below

> 
> /*
>  * need to reserve tons of extra space in the mbufs so we can align the
>@@ -2770,7 +2771,6 @@ construct_dpdk_options(const struct
>ovsrec_open_vswitch *ovs_cfg,
> {"dpdk-mem-channels", "-n", false, "4"},
> {"dpdk-hugepage-dir", "--huge-dir", false, NULL},
> };
>-
> int i, ret = initial_size;
> 
> /*First, construct from the flat-options (non-mutex)*/
>@@ -2848,9 +2848,10 @@ construct_dpdk_mutex_options(const struct
>ovsrec_open_vswitch *ovs_cfg,
> }
> 
> static int
>-get_dpdk_args(const struct ovsrec_open_vswitch *ovs_cfg, char ***argv)
>+get_dpdk_args(const struct ovsrec_open_vswitch *ovs_cfg, char ***argv,
>+  int argc)
> {
>-int i = construct_dpdk_options(ovs_cfg, argv, 1);
>+int i = construct_dpdk_options(ovs_cfg, argv, argc);
> i = construct_dpdk_mutex_options(ovs_cfg, argv, i);
> return i;
> }
>@@ -2874,7 +2875,8 @@ __dpdk_init(const struct ovsrec_open_vswitch
>*ovs_cfg)
> {
> char **argv = NULL;
> int result;
>-int argc;
>+int argc = 0, argc_tmp;
>+bool auto_determine = true;
> int err;
> cpu_set_t cpuset;
> 
>@@ -2908,12 +2910,41 @@ __dpdk_init(const struct ovsrec_open_vswitch
>*ovs_cfg)
> ovs_abort(0, "Thread getaffinity error %d.", err);
> }
> 
>-argv = grow_argv(, 0, 1);
>+argv = grow_argv(, argc, argc+1);

I think "1" is better than "argc+1"

> if (!argv) {
> ovs_abort(0, "Unable to allocate an initial argv.");
> }
>-argv[0] = xstrdup("ovs"); /* TODO use prctl to get process name */
>-argc = get_dpdk_args(ovs_cfg, );
>+argv[argc++] = xstrdup("ovs"); /* TODO use prctl to get process name
>*/
>+argc_tmp = get_dpdk_args(ovs_cfg, , argc);
>+
>+while(argc_tmp != argc) {
>+if (!strcmp("-c", argv[argc++])) {
>+auto_determine = false;
>+break;
>+}
>+}
>+argc = argc_tmp;
>+
>+/**
>+ * NOTE: This is an unsophisticated mechanism for determining the
>DPDK
>+ * lcore for the DPDK Master.
>+ */
>+if (auto_determine) {
>+int i;
>+for (i = 0; i < CPU_SETSIZE; i++) {
>+if (CPU_ISSET(i, )) {
>+char buf[MAX_BUFSIZ];
>+snprintf(buf, MAX_BUFSIZ, "0x%08llX", (1ULL<+argv = grow_argv(, argc, 2);
>+if (!argv) {
>+ovs_abort(0, "Unable to grow argv for coremask");
>+}
>+argv[argc++] = xstrdup("-c");
>+argv[argc++] = xstrdup(buf);
>+i = CPU_SETSIZE;
>+}
>+}
>+}
> 
> argv = grow_argv(, argc, 1);
> if (!argv) {
>@@ -2929,10 +2960,16 @@ __dpdk_init(const struct ovsrec_open_vswitch
>*ovs_cfg)
> ovs_abort(result, "Cannot init EAL");
> }
> 
>-/* Set the main thread affinity back to pre rte_eal_init() value */
>-err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t),
>);
>-if (err) {
>-ovs_abort(0, "Thread getaffinity error %d.", err);
>+if (auto_determine) {
>+/* Set the main thread affinity back to pre rte_eal_init() value
>*/
>+err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t),
>+ );
>+if (err) {
>+

Re: [ovs-dev] [PATCH v10 0/6] Convert DPDK configuration from command line to DB based

2016-03-28 Thread Daniele Di Proietto
Hi Aaron,

apologies for the delay.  I'm finally convinced that this is the
best way for OVS to initialize DPDK.

I've put some comments on the patches, mostly minor fixes.  Other than
those, I think this is ready to be merged.

Thanks for all your work,

Daniele

>> Aaron Conole  writes:
>>> Currently, configuration of DPDK parameters is done via the command
>>>line
>>> through a --dpdk **OPTIONS** -- command line argument. This has a
>>>number of
>>> challenges, including:
>>> * It must be the first option passed to ovs-vswitchd
>>> * It is the only datapath feature in OVS to be configured on the
>>>command line
>>> * It requires specialized knowledge of sub-component command switches
>>> * It also interprets non-EAL arguments (confusing users)
>>> * It is a broken model.
>>>
>>>
>>> This series brings the following changes to openvswitch:
>>> * All DPDK options are taken from the ovs database rather than the
>>>   command line
>>> * Non-EAL arguments also have separate database entries
>>> * DPDK lcores are optionally auto-assigned to a single core based on
>>>the
>>>   bridge coremask.
>>> * DPDK options have default behaviors
>>> * Updated documentation
>>>
>>> v2:
>>> * Dropped the vhost-user socket configuration options. Those can be
>>>re-added
>>>   as an extension
>>> * Incorporated feedback from Kevin Traynor.
>>>
>>> v3:
>>> * Went back to a global dpdk-init
>>> * Language cleanup and various minor fixes
>>>
>>> v4:
>>> * Added a way to pass arbitrary eal arguments
>>>
>>> v5:
>>> * Restore the socket-mem default, and fix up the ovs-dev.py script,
>>>along
>>>   with the manpage for ovsdb-server
>>>
>>> v6:
>>> * Correct a documentation issue with INSTALL.DPDK.md
>>> * Correct a non-dpdk enabled OVS incorrect warning variable
>>> * Remove an excess whitespace
>>>
>>> v7:
>>> * After testing by Christian with dpdk-alloc-mem
>>>
>>> v8:
>>> * Confirmed ``make check`` operation with and without dpdk.
>>>   Retested on live-host
>>>
>>> v9:
>>> * Cleanup of comments
>>> * Cleanup of one place where headers are specified
>>> * Mark the dpdk coremask and numa config as optional
>>> * Added 5/6 to scan the extras and warn the user when conflicting
>>>   DB entries are present
>>> * Acks given for all but patch 5/6
>>>
>>> v10:
>>> * Rebased against latest upstream
>>> * ACK or Tested-by for all patches
>>> * Code cleanup on patch 2/6 (vhost-cuse warning)
>>> * DB options documentation cleanup.
>>>
>>> Aaron Conole (6):
>>>   netdev-dpdk: Restore thread affinity after DPDK init
>>>   netdev-dpdk: Convert initialization from cmdline to db
>>>   netdev-dpdk: Autofill lcore coremask if absent
>>>   netdev-dpdk: Allow arbitrary eal arguments
>>>   netdev-dpdk: Check dpdk-extra when reading db
>>>   NEWS: Announce the DPDK EAL configuration change
>>>
>>>  FAQ.md |   6 +-
>>>  INSTALL.DPDK.md|  86 +++---
>>>  NEWS   |   3 +
>>>  lib/netdev-dpdk.c  | 409
>>>+
>>>  lib/netdev-dpdk.h  |  22 ++-
>>>  tests/ofproto-macros.at|   3 +-
>>>  utilities/ovs-dev.py   |   7 +-
>>>  vswitchd/bridge.c  |   3 +
>>>  vswitchd/ovs-vswitchd.8.in |   5 +-
>>>  vswitchd/ovs-vswitchd.c|  25 +--
>>>  vswitchd/vswitch.xml   | 142 +++-
>>>  11 files changed, 581 insertions(+), 130 deletions(-)
>> ___
>> dev mailing list
>> dev@openvswitch.org
>> http://openvswitch.org/mailman/listinfo/dev

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v6 10/12] dpif-netdev: Fix reconfigure_pmd_threads().

2016-03-28 Thread Daniele Di Proietto
This commit changes reconfigure_pmd_threads() to interact with the ports
cmap using RCU semantics (the content of the port structure is not
altered while concurrent readers might access it) and to fail more
gracefully in case of a set_multiq failure (now we remove the port from the
datapath, instead of returning prematurely from the function without
restarting the pmd threads).

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
Tested-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c | 82 +++
 1 file changed, 59 insertions(+), 23 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c66bf29..beca9eb 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2591,6 +2591,11 @@ static void
 reconfigure_pmd_threads(struct dp_netdev *dp)
 {
 struct dp_netdev_port *port;
+struct hmapx to_reconfigure = HMAPX_INITIALIZER(_reconfigure);
+struct hmapx_node *node;
+bool failed_config = false;
+
+ovs_mutex_lock(>port_mutex);
 
 dp_netdev_destroy_all_pmds(dp);
 
@@ -2599,33 +2604,64 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
 int requested_n_rxq = netdev_requested_n_rxq(netdev);
 if (netdev_is_pmd(port->netdev)
 && port->latest_requested_n_rxq != requested_n_rxq) {
-int i, err;
+cmap_remove(&dp->ports, &port->node, hash_odp_port(port->port_no));
+hmapx_add(&to_reconfigure, port);
+}
+}
+ovs_mutex_unlock(&dp->port_mutex);
 
-/* Closes the existing 'rxq's. */
-for (i = 0; i < port->n_rxq; i++) {
-netdev_rxq_close(port->rxq[i]);
-port->rxq[i] = NULL;
-}
-port->n_rxq = 0;
-
-/* Sets the new rx queue config. */
-err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores() + 1,
-requested_n_rxq);
-if (err && (err != EOPNOTSUPP)) {
-VLOG_ERR("Failed to set dpdk interface %s rx_queue to: %u",
- netdev_get_name(port->netdev),
- requested_n_rxq);
-return;
-}
-port->latest_requested_n_rxq = requested_n_rxq;
-/* If the set_multiq() above succeeds, reopens the 'rxq's. */
-port->n_rxq = netdev_n_rxq(port->netdev);
-port->rxq = xrealloc(port->rxq, sizeof *port->rxq * port->n_rxq);
-for (i = 0; i < port->n_rxq; i++) {
-netdev_rxq_open(port->netdev, &port->rxq[i], i);
+/* Waits for the other threads to see the ports removed from the cmap,
+ * otherwise we are not allowed to alter them. */
+ovsrcu_synchronize();
+
+ovs_mutex_lock(&dp->port_mutex);
+HMAPX_FOR_EACH (node, &to_reconfigure) {
+int requested_n_rxq, i, err;
+
+port = node->data;
+requested_n_rxq = netdev_requested_n_rxq(port->netdev);
+/* Closes the existing 'rxq's. */
+for (i = 0; i < port->n_rxq; i++) {
+netdev_rxq_close(port->rxq[i]);
+port->rxq[i] = NULL;
+}
+port->n_rxq = 0;
+
+/* Sets the new rx queue config. */
+err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores() + 1,
+requested_n_rxq);
+if (err && (err != EOPNOTSUPP)) {
+VLOG_ERR("Failed to set dpdk interface %s rx_queue to: %u",
+ netdev_get_name(port->netdev),
+ requested_n_rxq);
+do_destroy_port(port);
+failed_config = true;
+continue;
+}
+port->latest_requested_n_rxq = requested_n_rxq;
+/* If the netdev_set_multiq() above succeeds, reopens the 'rxq's and
+ * inserts the port back in the cmap, to allow transmitting packets. */
+port->rxq = xrealloc(port->rxq, sizeof *port->rxq
+* netdev_n_rxq(port->netdev));
+for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
+err = netdev_rxq_open(port->netdev, &port->rxq[i], i);
+if (err) {
+do_destroy_port(port);
+failed_config = true;
+continue;
 }
+port->n_rxq++;
 }
+cmap_insert(&dp->ports, &port->node, hash_port_no(port->port_no));
 }
+ovs_mutex_unlock(&dp->port_mutex);
+
+hmapx_destroy(&to_reconfigure);
+
+if (failed_config) {
+seq_change(dp->port_seq);
+}
+
 /* Reconfigures the cpu mask. */
 ovs_numa_set_cpu_mask(dp->requested_pmd_cmask);
 free(dp->pmd_cmask);
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v6 07/12] ofproto-dpif: Call dpif_poll_threads_set() before dpif_run()

2016-03-28 Thread Daniele Di Proietto
An upcoming commit will make dpif_poll_threads_set() record the
requested configuration and dpif_run() apply it, so it makes sense to
change the order.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
Tested-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Ilya Maximets <i.maxim...@samsung.com>
---
 ofproto/ofproto-dpif.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 6182ec2..a364f6c 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -536,6 +536,8 @@ type_run(const char *type)
 return 0;
 }
 
+/* This must be called before dpif_run() */
+dpif_poll_threads_set(backer->dpif, pmd_cpu_mask);
 
 if (dpif_run(backer->dpif)) {
 backer->need_revalidate = REV_RECONFIGURE;
@@ -564,8 +566,6 @@ type_run(const char *type)
 udpif_set_threads(backer->udpif, n_handlers, n_revalidators);
 }
 
-dpif_poll_threads_set(backer->dpif, pmd_cpu_mask);
-
 if (backer->need_revalidate) {
 struct ofproto_dpif *ofproto;
 struct simap_node *node;
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v6 09/12] dpif-netdev: Document locking discipline for non_pmd_mutex.

2016-03-28 Thread Daniele Di Proietto
This just documents what the current code already assumes.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
Tested-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 6aaeaeb..c66bf29 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -184,6 +184,7 @@ static bool dpcls_lookup(const struct dpcls *cls,
  * Acquisition order is, from outermost to innermost:
  *
  *dp_netdev_mutex (global)
+ *non_pmd_mutex
  *port_mutex
  */
 struct dp_netdev {
-- 
2.1.4

___
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


[ovs-dev] [PATCH v6 11/12] netdev: Add reconfigure request mechanism.

2016-03-28 Thread Daniele Di Proietto
A netdev provider, especially a PMD provider (like netdev DPDK) might
not be able to change some of its parameters (such as MTU, or number of
queues) without stopping everything and restarting.

This commit introduces a mechanism that allows a netdev provider to
request a restart (netdev_request_reconfigure()).  The upper layer can
be notified via netdev_wait_reconf_required() and
netdev_is_reconf_required().  After closing all the rxqs the upper layer
can finally call netdev_reconfigure(), to make sure that the new
configuration is in place.

This will be used by next commit to reconfigure rx and tx queues in
netdev-dpdk.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
Tested-by: Ilya Maximets <i.maxim...@samsung.com>
Acked-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/netdev-bsd.c  |  1 +
 lib/netdev-dpdk.c |  1 +
 lib/netdev-dummy.c|  1 +
 lib/netdev-linux.c|  1 +
 lib/netdev-provider.h | 27 ++-
 lib/netdev-vport.c|  1 +
 lib/netdev.c  | 38 ++
 lib/netdev.h  |  4 
 8 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index 75bd5a3..7a430aa 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1537,6 +1537,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_bsd_arp_lookup, /* arp_lookup */  \
  \
 netdev_bsd_update_flags, \
+NULL, /* reconfigure */  \
  \
 netdev_bsd_rxq_alloc,\
 netdev_bsd_rxq_construct,\
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index f8c8ddc..7a6e2db 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2695,6 +2695,7 @@ static const struct dpdk_qos_ops egress_policer_ops = {
 NULL,   /* arp_lookup */  \
   \
 netdev_dpdk_update_flags, \
+NULL,   /* reconfigure */ \
   \
 netdev_dpdk_rxq_alloc,\
 netdev_dpdk_rxq_construct,\
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index 8be1ba7..2d3cc3b 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1276,6 +1276,7 @@ static const struct netdev_class dummy_class = {
 NULL,   /* arp_lookup */
 
 netdev_dummy_update_flags,
+NULL,   /* reconfigure */
 
 netdev_dummy_rxq_alloc,
 netdev_dummy_rxq_construct,
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 994a27c..7568404 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2807,6 +2807,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum 
netdev_flags off,
 netdev_linux_arp_lookup,\
 \
 netdev_linux_update_flags,  \
+NULL,   /* reconfigure */   \
 \
 netdev_linux_rxq_alloc, \
 netdev_linux_rxq_construct, \
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 4419629..ef2da98 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -52,6 +52,16 @@ struct netdev {
  * 'netdev''s flags, features, ethernet address, or carrier changes. */
 uint64_t change_seq;
 
+/* A netdev provider might be unable to change some of the device's
+ * parameter (n_rxq, mtu) when the device is in use.  In this case
+ * the provider can notify the upper layer by calling
+ * netdev_request_reconfigure().  The upper layer will react by stopping
+ * the operations on the device and calling netdev_reconfigure() to allow
+ * the configuration changes.  'last_reconfigure_seq' remembers the value
+ * of 'reconfigure_seq' when the last reconfiguration happened. */
+struct seq *reconfigure_seq;
+uint64_t last_reconfigure_seq;
+
 /* The core netdev code initializes these at netdev construction and only
  * provide read-only access to its client.  Netdev implementations may
  * modify them. */
@@ -64,7 +74,7 @@ struct netdev {
 struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". 
*/
 };
 
-static void
+static inline void
 netdev_change_seq_changed(const struct netdev *netdev_)
 {
 struct netdev *netdev = CONST_CAST(struct netdev *, netdev_);
@@ -75,6 +85,12 @@ netdev_change_seq_changed(const struct netdev *netdev_)
 }
 }
 
+static inline void

Re: [ovs-dev] [PATCH v5 08/12] dpif-netdev: Change pmd thread configuration in dpif_netdev_run().

2016-03-28 Thread Daniele Di Proietto
Hi Mark,

thanks for your comment, I replied inline

On 24/03/2016 10:17, "Kavanagh, Mark B" <mark.b.kavan...@intel.com> wrote:

>Hi Daniele,
>
>One comment inline.
>
>Cheers,
>Mark
>
>>-----Original Message-----
>>From: Daniele Di Proietto [mailto:diproiet...@vmware.com]
>>Sent: Wednesday, March 23, 2016 6:37 PM
>>To: dev@openvswitch.org
>>Cc: Ben Pfaff; Kavanagh, Mark B; Ilya Maximets; Daniele Di Proietto
>>Subject: [PATCH v5 08/12] dpif-netdev: Change pmd thread configuration
>>in dpif_netdev_run().
>>
>>Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>>Tested-by: Ilya Maximets <i.maxim...@samsung.com>
>>Acked-by: Ilya Maximets <i.maxim...@samsung.com>
>>---
>> lib/dpif-netdev.c   | 140
>>++--
>> lib/dpif-provider.h |   3 +-
>> 2 files changed, 83 insertions(+), 60 deletions(-)
>>
>>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>index 66c0b19..6aaeaeb 100644
>>--- a/lib/dpif-netdev.c
>>+++ b/lib/dpif-netdev.c
>>@@ -223,7 +223,9 @@ struct dp_netdev {
>> ovsthread_key_t per_pmd_key;
>>
>> /* Cpu mask for pin of pmd threads. */
>>+char *requested_pmd_cmask;
>> char *pmd_cmask;
>>+
>> uint64_t last_tnl_conf_seq;
>> };
>>
>>@@ -2447,82 +2449,44 @@ dpif_netdev_operate(struct dpif *dpif, struct
>>dpif_op **ops, size_t
>>n_ops)
>> }
>> }
>>
>>-/* Returns true if the configuration for rx queues or cpu mask
>>- * is changed. */
>>+/* Returns true if the configuration for rx queues is changed. */
>> static bool
>>-pmd_config_changed(const struct dp_netdev *dp, const char *cmask)
>>+pmd_n_rxq_changed(const struct dp_netdev *dp)
>> {
>> struct dp_netdev_port *port;
>>
>> CMAP_FOR_EACH (port, node, &dp->ports) {
>>-struct netdev *netdev = port->netdev;
>>-int requested_n_rxq = netdev_requested_n_rxq(netdev);
>>-if (netdev_is_pmd(netdev)
>>+int requested_n_rxq = netdev_requested_n_rxq(port->netdev);
>>+
>>+if (netdev_is_pmd(port->netdev)
>> && port->latest_requested_n_rxq != requested_n_rxq) {
>> return true;
>> }
>> }
>>
>>-if (dp->pmd_cmask != NULL && cmask != NULL) {
>>-return strcmp(dp->pmd_cmask, cmask);
>>-} else {
>>-return (dp->pmd_cmask != NULL || cmask != NULL);
>>+return false;
>>+}
>>+
>>+static bool
>>+cmask_equals(const char *a, const char *b)
>>+{
>>+if (a && b) {
>>+return !strcmp(a, b);
>> }
>>+
>>+return a == NULL && b == NULL;
>> }
>>
>>-/* Resets pmd threads if the configuration for 'rxq's or cpu mask
>>changes. */
>>+/* Changes the number or the affinity of pmd threads.  The changes are
>>actually
>>+ * applied in dpif_netdev_run(). */
>> static int
>> dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
>> {
>> struct dp_netdev *dp = get_dp_netdev(dpif);
>>
>>-if (pmd_config_changed(dp, cmask)) {
>>-struct dp_netdev_port *port;
>>-
>>-dp_netdev_destroy_all_pmds(dp);
>>-
>>-CMAP_FOR_EACH (port, node, &dp->ports) {
>>-struct netdev *netdev = port->netdev;
>>-int requested_n_rxq = netdev_requested_n_rxq(netdev);
>>-if (netdev_is_pmd(port->netdev)
>>-&& port->latest_requested_n_rxq != requested_n_rxq) {
>>-int i, err;
>>-
>>-/* Closes the existing 'rxq's. */
>>-for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
>>-netdev_rxq_close(port->rxq[i]);
>>-port->rxq[i] = NULL;
>>-}
>>-port->n_rxq = 0;
>>-
>>-/* Sets the new rx queue config.  */
>>-err = netdev_set_multiq(port->netdev,
>>-ovs_numa_get_n_cores() + 1,
>>-requested_n_rxq);
>>-if (err && (err != EOPNOTSUPP)) {
>>-VLOG_ERR("Failed to set dpdk interface %s rx_queue
>>to:"
>>- " %u", netdev_get_name(port->netdev),
>>- requested_n_rxq);
>>-return e

[ovs-dev] [PATCH v6 06/12] dpif-netdev: Wait an RCU grace period before freeing ports.

2016-03-28 Thread Daniele Di Proietto
The dpif-netdev datapath keeps ports in a cmap which is written only by
the main thread (holding port_mutex), but which is read concurrently by
many threads (most notably the pmd threads).

When removing ports from the datapath we should postpone the deletion,
otherwise another thread might access invalid memory while reading the
cmap.

This commit splits do_port_del() in do_port_remove() and
do_port_destroy(): the former removes the port from the cmap, while the
latter reclaims the memory and drops the reference to the underlying
netdev.

dpif_netdev_del_port() now uses ovsrcu_synchronize() before calling
do_port_destroy(), to avoid memory corruption in concurrent readers.

I've not been able to reproduce the bug in practice, since when the
datapath modifies its ports it stops (or locks out) most of the threads
that may access the cmap.  This change is done for two reasons:
* Using RCU is more in line with other cmap users.
* We might want to allow port removal and queue reconfiguration without
  stopping completely all the pmd threads.

Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
Acked-by: Ilya Maximets <i.maxim...@samsung.com>
---
 lib/dpif-netdev.c | 120 --
 1 file changed, 80 insertions(+), 40 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 342476a..66c0b19 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -469,8 +469,9 @@ static void dp_netdev_free(struct dp_netdev *)
 static int do_add_port(struct dp_netdev *dp, const char *devname,
const char *type, odp_port_t port_no)
 OVS_REQUIRES(dp->port_mutex);
-static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
+static void do_remove_port(struct dp_netdev *dp, struct dp_netdev_port *)
 OVS_REQUIRES(dp->port_mutex);
+static void do_destroy_port(struct dp_netdev_port *);
 static int dpif_netdev_open(const struct dpif_class *, const char *name,
 bool create, struct dpif **);
 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
@@ -495,6 +496,7 @@ static struct dp_netdev_pmd_thread 
*dp_netdev_get_pmd(struct dp_netdev *dp,
 static struct dp_netdev_pmd_thread *
 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
+static bool has_pmd_port_for_numa(struct dp_netdev *dp, int numa_id);
 static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
 static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id);
 static void dp_netdev_pmd_clear_poll_list(struct dp_netdev_pmd_thread *pmd);
@@ -969,7 +971,8 @@ static void
 dp_netdev_free(struct dp_netdev *dp)
 OVS_REQUIRES(dp_netdev_mutex)
 {
-struct dp_netdev_port *port;
+struct dp_netdev_port *port, **port_list;
+size_t n_ports, k;
 
 shash_find_and_delete(&dp_netdevs, dp->name);
 
@@ -978,11 +981,25 @@ dp_netdev_free(struct dp_netdev *dp)
 ovsthread_key_delete(dp->per_pmd_key);
 
 ovs_mutex_lock(&dp->port_mutex);
+n_ports = cmap_count(&dp->ports);
+port_list = xcalloc(n_ports, sizeof *port_list);
+
+k = 0;
 CMAP_FOR_EACH (port, node, &dp->ports) {
-/* PMD threads are destroyed here. do_del_port() cannot quiesce */
-do_del_port(dp, port);
+do_remove_port(dp, port);
+ovs_assert(k < n_ports);
+port_list[k++] = port;
 }
 ovs_mutex_unlock(&dp->port_mutex);
+
+ovsrcu_synchronize();
+
+for (size_t i = 0; i < k; i++) {
+do_destroy_port(port_list[i]);
+}
+
+free(port_list);
+
 cmap_destroy(&dp->poll_threads);
 
 seq_destroy(dp->port_seq);
@@ -1226,22 +1243,49 @@ static int
 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
 {
 struct dp_netdev *dp = get_dp_netdev(dpif);
+struct dp_netdev_port *port = NULL;
 int error;
 
 ovs_mutex_lock(&dp->port_mutex);
 if (port_no == ODPP_LOCAL) {
 error = EINVAL;
 } else {
-struct dp_netdev_port *port;
 
 error = get_port_by_number(dp, port_no, &port);
 if (!error) {
-do_del_port(dp, port);
+do_remove_port(dp, port);
 }
 }
 ovs_mutex_unlock(&dp->port_mutex);
 
-return error;
+if (!port) {
+return error;
+}
+
+if (netdev_is_pmd(port->netdev)) {
+int numa_id = netdev_get_numa_id(port->netdev);
+
+/* PMD threads can not be on invalid numa node. */
+ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
+/* If there is no netdev on the numa node, deletes the pmd threads
+ * for that numa.  Else, deletes the queues from polling lists. */
+if (!has_pmd_port_for_numa(dp, numa_id)) {
+dp_netdev_del_pmds_on_numa(dp, numa_id);
+} else {
+dp_netdev_del_port_from_all_pmds(dp, port);
+}
+}
+
+/* 'port' is RCU protected, we need to wait bef

<    2   3   4   5   6   7   8   9   10   11   >