[ovs-dev] [PATCH v2 3/3] dpctl: Support flush conntrack by conntrack 5-tuple

2017-11-21 Thread Yi-Hung Wei
With this patch, "flush-conntrack" in ovs-dpctl and ovs-appctl accept
a conntrack 5-tuple to delete the conntrack entry specified by the 5-tuple.
For example, a user can use the following commands to flush a conntrack entry
in zone 5.

$ ovs-dpctl flush-conntrack zone=5 \
  'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1'
$ ovs-appctl dpctl/flush-conntrack zone=5 \
  'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1'

VMWare-BZ: #1983178
Signed-off-by: Yi-Hung Wei 
---
 NEWS|   2 +
 lib/ct-dpif.c   | 108 
 lib/ct-dpif.h   |   1 +
 lib/dpctl.c |  76 +++---
 lib/dpctl.man   |  20 +++--
 tests/system-traffic.at |  64 
 utilities/ovs-dpctl.c   |   4 +-
 7 files changed, 253 insertions(+), 22 deletions(-)

diff --git a/NEWS b/NEWS
index a93237f9078b..198ab7e7d09f 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,8 @@ Post-v2.8.0
  IPv6 packets.
- Linux kernel 4.13
  * Add support for compiling OVS with the latest Linux 4.13 kernel
+   - "flush-conntrack" in ovs-dpctl and ovs-appctl now accept a 5-tuple to
+ delete a specific connection tracking entry.
 
 v2.8.0 - 31 Aug 2017
 
diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index 5cd6b5cfd870..3cad5c928e8b 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -20,6 +20,7 @@
 #include 
 
 #include "ct-dpif.h"
+#include "openvswitch/ofp-parse.h"
 #include "openvswitch/vlog.h"
 
 VLOG_DEFINE_THIS_MODULE(ct_dpif);
@@ -434,3 +435,110 @@ ct_dpif_format_tcp_stat(struct ds * ds, int tcp_state, 
int conn_per_state)
 ds_put_cstr(ds, "]");
 ds_put_format(ds, "=%u", conn_per_state);
 }
+
+/* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'.
+ * Returns true on success.  Otherwise, returns false and puts the error
+ * message in 'ds'. */
+bool
+ct_dpif_parse_tuple(struct ct_dpif_tuple *tuple, const char *s, struct ds *ds)
+{
+char *pos, *key, *value, *copy;
+memset(tuple, 0, sizeof *tuple);
+
+pos = copy = xstrdup(s);
+while (ofputil_parse_key_value(&pos, &key, &value)) {
+if (!*value) {
+ds_put_format(ds, "field %s missing value", key);
+goto error;
+}
+
+if (!strcmp(key, "ct_nw_src") || !strcmp(key, "ct_nw_dst")) {
+if (tuple->l3_type && tuple->l3_type != AF_INET) {
+ds_put_cstr(ds, "L3 type set multiple times");
+goto error;
+} else {
+tuple->l3_type = AF_INET;
+}
+if (!ip_parse(value, key[6] == 's' ? &tuple->src.ip :
+ &tuple->dst.ip)) {
+goto error_with_msg;
+}
+} else if (!strcmp(key, "ct_ipv6_src") ||
+   !strcmp(key, "ct_ipv6_dst")) {
+if (tuple->l3_type && tuple->l3_type != AF_INET6) {
+ds_put_cstr(ds, "L3 type set multiple times");
+goto error;
+} else {
+tuple->l3_type = AF_INET6;
+}
+if (!ipv6_parse(value, key[8] == 's' ? &tuple->src.in6 :
+   &tuple->dst.in6)) {
+goto error_with_msg;
+}
+} else if (!strcmp(key, "ct_nw_proto")) {
+char *err = str_to_u8(value, key, &tuple->ip_proto);
+if (err) {
+free(err);
+goto error_with_msg;
+}
+} else if (!strcmp(key, "ct_tp_src") || !strcmp(key, "ct_tp_dst")) {
+uint16_t port;
+char *err = str_to_u16(value, key, &port);
+if (err) {
+free(err);
+goto error_with_msg;
+}
+if (key[6] == 's') {
+tuple->src_port = htons(port);
+} else {
+tuple->dst_port = htons(port);
+}
+} else if (!strcmp(key, "icmp_type") || !strcmp(key, "icmp_code") ||
+   !strcmp(key, "icmp_id") ) {
+if (tuple->ip_proto != IPPROTO_ICMP &&
+tuple->ip_proto != IPPROTO_ICMPV6) {
+ds_put_cstr(ds, "Invalid L4 fields");
+goto error;
+}
+uint16_t icmp_id;
+char *err;
+if (key[5] == 't') {
+err = str_to_u8(value, key, &tuple->icmp_type);
+} else if (key[5] == 'c') {
+err = str_to_u8(value, key, &tuple->icmp_code);
+} else {
+err = str_to_u16(value, key, &icmp_id);
+tuple->icmp_id = htons(icmp_id);
+}
+if (err) {
+free(err);
+goto error_with_msg;
+}
+} else {
+ds_put_format(ds, "Invalid conntrack tuple field: %s", key);
+goto error;
+}
+}
+
+if (ipv6_is_zero(&tuple->src.in6) 

[ovs-dev] [PATCH v2 2/3] conntrack: Support conntrack flush by ct 5-tuple

2017-11-21 Thread Yi-Hung Wei
This patch adds support for flushing a conntrack entry specified by the
conntrack 5-tuple in dpif-netdev.

Signed-off-by: Yi-Hung Wei 
---
 lib/conntrack.c   | 64 +++
 lib/conntrack.h   |  3 +++
 lib/dpif-netdev.c |  2 +-
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index f5a3aa9fab34..c9077c07042c 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2335,6 +2335,18 @@ ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
 }
 
 static void
+ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
+ struct ct_addr *b,
+ const uint16_t l3_type)
+{
+if (l3_type == AF_INET) {
+b->ipv4_aligned = a->ip;
+} else if (l3_type == AF_INET6) {
+b->ipv6_aligned = a->in6;
+}
+}
+
+static void
 conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
 {
 if (key->dl_type == htons(ETH_TYPE_IP)) {
@@ -2359,6 +2371,35 @@ conn_key_to_tuple(const struct conn_key *key, struct 
ct_dpif_tuple *tuple)
 }
 
 static void
+tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
+  struct conn_key *key)
+{
+if (tuple->l3_type == AF_INET) {
+key->dl_type = htons(ETH_TYPE_IP);
+} else if (tuple->l3_type == AF_INET6) {
+key->dl_type = htons(ETH_TYPE_IPV6);
+}
+key->nw_proto = tuple->ip_proto;
+ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
+ tuple->l3_type);
+ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
+ tuple->l3_type);
+
+if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
+key->src.icmp_id = tuple->icmp_id;
+key->src.icmp_type = tuple->icmp_type;
+key->src.icmp_code = tuple->icmp_code;
+key->dst.icmp_id = tuple->icmp_id;
+key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
+key->dst.icmp_code = tuple->icmp_code;
+} else {
+key->src.port = tuple->src_port;
+key->dst.port = tuple->dst_port;
+}
+key->zone = zone;
+}
+
+static void
 conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
   long long now, int bkt)
 {
@@ -2485,6 +2526,29 @@ conntrack_flush(struct conntrack *ct, const uint16_t 
*zone)
 return 0;
 }
 
+int
+conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
+  uint16_t zone)
+{
+struct conn_lookup_ctx ctx;
+int error = 0;
+
+memset(&ctx, 0, sizeof(ctx));
+tuple_to_conn_key(tuple, zone, &ctx.key);
+ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
+unsigned bucket = hash_to_bucket(ctx.hash);
+
+ct_lock_lock(&ct->buckets[bucket].lock);
+conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
+if (ctx.conn) {
+conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
+} else {
+error = ENOENT;
+}
+ct_lock_unlock(&ct->buckets[bucket].lock);
+return error;
+}
+
 /* This function must be called with the ct->resources read lock taken. */
 static struct alg_exp_node *
 expectation_lookup(struct hmap *alg_expectations,
diff --git a/lib/conntrack.h b/lib/conntrack.h
index fbeef1c4754e..b5b26dd96e00 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -107,6 +107,7 @@ struct conntrack_dump {
 };
 
 struct ct_dpif_entry;
+struct ct_dpif_tuple;
 
 int conntrack_dump_start(struct conntrack *, struct conntrack_dump *,
  const uint16_t *pzone, int *);
@@ -114,6 +115,8 @@ int conntrack_dump_next(struct conntrack_dump *, struct 
ct_dpif_entry *);
 int conntrack_dump_done(struct conntrack_dump *);
 
 int conntrack_flush(struct conntrack *, const uint16_t *zone);
+int conntrack_flush_tuple(struct conntrack *, const struct ct_dpif_tuple *,
+  uint16_t zone);
 
 /* 'struct ct_lock' is a wrapper for an adaptive mutex.  It's useful to try
  * different types of locks (e.g. spinlocks) */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b1ef9a6a5992..527442ebfc2a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5740,7 +5740,7 @@ dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t 
*zone,
 struct dp_netdev *dp = get_dp_netdev(dpif);
 
 if (tuple) {
-return EOPNOTSUPP;
+return conntrack_flush_tuple(&dp->conntrack, tuple, zone ? *zone : 0);
}
return conntrack_flush(&dp->conntrack, zone);
 }
-- 
2.7.4



[ovs-dev] [PATCH v2 1/3] ct-dpif, dpif-netlink: Support conntrack flush by ct 5-tuple

2017-11-21 Thread Yi-Hung Wei
This patch adds support for flushing a conntrack entry specified by the
conntrack 5-tuple, and provides the implementation in dpif-netlink.
The dpif-netlink implementation for the Linux datapath uses the
NFNL_SUBSYS_CTNETLINK netlink subsystem to delete a conntrack entry in
nf_conntrack.
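
For reference, the resulting CTNETLINK delete request is sketched below; this
mirrors the nl_ct_flush_tuple()/nl_ct_delete() code discussed later in this
thread, and the function name here is only illustrative:

    static int
    ct_flush_tuple_sketch(const struct ct_dpif_tuple *tuple, uint16_t zone)
    {
        struct ofpbuf buf;
        int err;

        ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
        /* IPCTNL_MSG_CT_DELETE on NFNL_SUBSYS_CTNETLINK deletes one entry. */
        nl_msg_put_nfgenmsg(&buf, 0, tuple->l3_type, NFNL_SUBSYS_CTNETLINK,
                            IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
        nl_msg_put_be16(&buf, CTA_ZONE, htons(zone));
        if (!nl_ct_put_ct_tuple(&buf, tuple, CTA_TUPLE_ORIG)) {
            err = EOPNOTSUPP;    /* Unsupported tuple contents. */
        } else {
            err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
        }
        ofpbuf_uninit(&buf);
        return err;
    }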

VMWare-BZ: #1983178
Signed-off-by: Yi-Hung Wei 
---
 lib/ct-dpif.c   | 23 +---
 lib/ct-dpif.h   |  3 +-
 lib/dpctl.c |  2 +-
 lib/dpif-netdev.c   |  6 ++-
 lib/dpif-netlink.c  |  7 +++-
 lib/dpif-provider.h | 13 +--
 lib/netlink-conntrack.c | 97 +
 lib/netlink-conntrack.h |  1 +
 ofproto/ofproto-dpif.c  |  2 +-
 tests/ovs-ofctl.at  |  2 +-
 10 files changed, 140 insertions(+), 16 deletions(-)

diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index c79e69e23970..5cd6b5cfd870 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -111,19 +111,30 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump)
 }
 
 /* Flush the entries in the connection tracker used by 'dpif'.
- *
- * If 'zone' is not NULL, flush only the entries in '*zone'. */
+ * If both 'zone' and 'tuple' are NULL, flush all the conntrack entries.
+ * If 'zone' is not NULL, and 'tuple' is NULL, flush all the conntrack
+ * entries in '*zone'.
+ * If 'tuple' is not NULL, flush the conntrack entry specified by 'tuple'
+ * in '*zone'. In this case, we use default zone (zone 0) if 'zone' is
+ * NULL. */
 int
-ct_dpif_flush(struct dpif *dpif, const uint16_t *zone)
+ct_dpif_flush(struct dpif *dpif, const uint16_t *zone,
+  const struct ct_dpif_tuple *tuple)
 {
-if (zone) {
-VLOG_DBG("%s: ct_flush: %"PRIu16, dpif_name(dpif), *zone);
+if (tuple) {
+struct ds ds = DS_EMPTY_INITIALIZER;
+ct_dpif_format_tuple(&ds, tuple);
+VLOG_DBG("%s: ct_flush: %s in zone %d", dpif_name(dpif), ds_cstr(&ds),
+zone ? *zone : 0);
+ds_destroy(&ds);
+} else if (zone) {
+VLOG_DBG("%s: ct_flush: zone %"PRIu16, dpif_name(dpif), *zone);
 } else {
 VLOG_DBG("%s: ct_flush: ", dpif_name(dpif));
 }
 
 return (dpif->dpif_class->ct_flush
-? dpif->dpif_class->ct_flush(dpif, zone)
+? dpif->dpif_class->ct_flush(dpif, zone, tuple)
 : EOPNOTSUPP);
 }
 
diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h
index d5f966175bc0..ef019050c78e 100644
--- a/lib/ct-dpif.h
+++ b/lib/ct-dpif.h
@@ -195,7 +195,8 @@ int ct_dpif_dump_start(struct dpif *, struct 
ct_dpif_dump_state **,
const uint16_t *zone, int *);
 int ct_dpif_dump_next(struct ct_dpif_dump_state *, struct ct_dpif_entry *);
 int ct_dpif_dump_done(struct ct_dpif_dump_state *);
-int ct_dpif_flush(struct dpif *, const uint16_t *zone);
+int ct_dpif_flush(struct dpif *, const uint16_t *zone,
+  const struct ct_dpif_tuple *);
 void ct_dpif_entry_uninit(struct ct_dpif_entry *);
 void ct_dpif_format_entry(const struct ct_dpif_entry *, struct ds *,
   bool verbose, bool print_stats);
diff --git a/lib/dpctl.c b/lib/dpctl.c
index 6520788aa428..7fc0e3afab37 100644
--- a/lib/dpctl.c
+++ b/lib/dpctl.c
@@ -1353,7 +1353,7 @@ dpctl_flush_conntrack(int argc, const char *argv[],
 return error;
 }
 
-error = ct_dpif_flush(dpif, pzone);
+error = ct_dpif_flush(dpif, pzone, NULL);
 
 dpif_close(dpif);
 return error;
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 0a62630c2712..b1ef9a6a5992 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5734,10 +5734,14 @@ dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
 }
 
 static int
-dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
+dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
+ const struct ct_dpif_tuple *tuple)
 {
 struct dp_netdev *dp = get_dp_netdev(dpif);
 
+if (tuple) {
+return EOPNOTSUPP;
+}
return conntrack_flush(&dp->conntrack, zone);
 }
 
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index c265909f8587..3f76128999d1 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -2892,9 +2892,12 @@ dpif_netlink_ct_dump_done(struct dpif *dpif OVS_UNUSED,
 }
 
 static int
-dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, const uint16_t *zone)
+dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, const uint16_t *zone,
+  const struct ct_dpif_tuple *tuple)
 {
-if (zone) {
+if (tuple) {
+return nl_ct_flush_tuple(tuple, zone ? *zone : 0);
+} else if (zone) {
 return nl_ct_flush_zone(*zone);
 } else {
 return nl_ct_flush();
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index 1d82a0939aa1..33d7f2a64367 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -75,6 +75,7 @@ dpif_flow_dump_thread_init(struct dpif_flow_dump_thread 
*thread,
 
 struct ct_dpif_dump_state;
 struct ct_dpif_entry;
+struct 

Re: [ovs-dev] [PATCH 2/2] dpctl: Support flush conntrack by conntrack 5-tuple

2017-11-21 Thread Yi-Hung Wei
On Wed, Nov 15, 2017 at 5:28 PM, Justin Pettit  wrote

> >
> > With this patch, ovs-dpctl flush-conntrack accepts a conntrack 5-tuple
>
> I'd also mention ovs-appctl, since ovs-dpctl doesn't work on all platforms.
>
Sure.


> diff --git a/NEWS b/NEWS
> > index 047f34b9f402..800eb84cd24f 100644
> > --- a/NEWS
> > +++ b/NEWS
> > @@ -11,6 +11,8 @@ Post-v2.8.0
> >  * Add support for compiling OVS with the latest Linux 4.13 kernel
> > +   - "ovs-dpctl flush-conntrack" now accepts conntrack 5-tuple to
> delete a
> > + connection tracking entry.
> I would also mention the ovs-appctl, too.  Maybe:
>
> "flush-conntrack" in ovs-dpctl and ovs-appctl now accept a 5-tuple
> to delete a specific connection tracking entry.
>
Sure.


> diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
> > index a9f58f4eb449..bdd9ef5d2551 100644
> > --- a/lib/ct-dpif.c
> > +++ b/lib/ct-dpif.c
> > @@ -20,6 +20,7 @@
> > #include 
> >
> > #include "ct-dpif.h"
> > +#include "openvswitch/ofp-parse.h"
> > #include "openvswitch/vlog.h"
> >
> > VLOG_DEFINE_THIS_MODULE(ct_dpif);
> > @@ -434,3 +435,83 @@ ct_dpif_format_tcp_stat(struct ds * ds, int
> tcp_state, int conn_per_state)
> > ds_put_cstr(ds, "]");
> > ds_put_format(ds, "=%u", conn_per_state);
> > }
> > +
> > +/* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'.
> > + * Returns true on success. Otherwise, returns false and puts the error
> msg
> > + * in 'ds'.
> > + */
>
> We would normally close the function comment on the previous line.  Also,
> I wouldn't abbreviate "message" in a comment.
>
Thanks, done in v2.


> +bool
> > +ct_dpif_parse_tuple(struct ct_dpif_tuple *tuple, const char *s, struct
> ds *ds)
> > +{
> > ...
> > +if (ipv6_is_zero(&tuple->src.in6) || ipv6_is_zero(&tuple->dst.in6) ||
> > +!tuple->ip_proto || !tuple->src_port || !tuple->dst_port) {
> > +ds_put_cstr(ds, "Miss at least one of the conntrack 5-tuple field");
>
> I think this error message might be a little clearer: "At least one of the
> conntrack 5-tuple fields is missing."
>
Sure, it is more clear.



> > +goto out2;
> > +}
> > +
> > +free(copy);
> > +return true;
> > +
> > +out:
> > +ds_put_format(ds, "Failed to parse field %s", key);
> > +out2:
> > +free(copy);
> > +return false;
>
> Minor, but I think these labels could be more descriptive.  What about
> "error_with_msg" and "error" or something?
>
Sounds good. I will fold in that in v2.


> diff --git a/lib/dpctl.c b/lib/dpctl.c
> > index 7569189d8f22..682b9bf35a75 100644
> > --- a/lib/dpctl.c
> > +++ b/lib/dpctl.c
> > @@ -1331,14 +1331,32 @@ dpctl_flush_conntrack(int argc, const char
> *argv[],
> >   struct dpctl_params *dpctl_p)
> > {
> > struct dpif *dpif;
> > +struct ct_dpif_tuple tuple, *ptuple = NULL;
> > +struct ds ds = DS_EMPTY_INITIALIZER;
> > uint16_t zone, *pzone = NULL;
> > char *name;
> > int error;
> >
> > +/* Parse ct tuple */
> > +if (argc > 1 && ct_dpif_parse_tuple(&tuple, argv[argc-1], &ds)) {
> > +ptuple = &tuple;
> > +argc--;
> > +} else if (argc == 4) {
> > +dpctl_error(dpctl_p, EINVAL, "%s", ds_cstr(&ds));
> > +ds_destroy(&ds);
> > +return EINVAL;
> > +}
> > +ds_destroy(&ds);
>
> I don't think this handles the case of an error parsing 'tuple' when a
> datapath name and zone isn't provided.  It also reads a little funny, since
> 'ds' can only have a value if there was an error, but it's called even on
> the non-error condition.  It's obviously correct, since it's a safe
> operation, but it's unnecessary.


> > +
> > +/* Parse zone */
> > if (argc > 1 && ovs_scan(argv[argc - 1], "zone=%"SCNu16, &zone)) {
> > pzone = &zone;
> > argc--;
> > +} else if (argc == 3) {
> > +dpctl_error(dpctl_p, EINVAL, "Invalid zone or conntrack tuple");
> > +return EINVAL;
>
> Same thing here about this only triggering if there wasn't a datapath name.
>
I rewrote the error reporting logic in v2. It catches more error cases, but
it also makes the code more complicated, since all three parameters of this
command are optional. I hope it reduces the chance of accidentally flushing
conntrack entries.
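
For reference, a rough sketch of that v2 argument handling, reusing the helper
names from the v1 hunk quoted above (the exact v2 code may differ); all of
[dp], [zone=N] and [ct-tuple] are optional, so the trailing arguments are
tested from right to left inside dpctl_flush_conntrack():

    struct ct_dpif_tuple tuple, *ptuple = NULL;
    uint16_t zone, *pzone = NULL;
    struct ds ds = DS_EMPTY_INITIALIZER;

    if (argc > 1 && ct_dpif_parse_tuple(&tuple, argv[argc - 1], &ds)) {
        ptuple = &tuple;          /* Last argument was a conntrack 5-tuple. */
        argc--;
    }
    if (argc > 1 && ovs_scan(argv[argc - 1], "zone=%"SCNu16, &zone)) {
        pzone = &zone;            /* Next argument was a zone. */
        argc--;
    }
    /* At most one argument (the datapath name) may remain; anything else is
     * reported as an error rather than silently flushing entries. */
    ds_destroy(&ds);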


> diff --git a/lib/dpctl.man b/lib/dpctl.man
> > index 675fe5af4914..f429f1653fa7 100644
> > --- a/lib/dpctl.man
> > +++ b/lib/dpctl.man
> > @@ -217,10 +217,20 @@ are included. With \fB\-\-statistics\fR timeouts
> and timestamps are
> > added to the output.
> > .
> > .TP
> > -\*(DX\fBflush\-conntrack\fR [\fIdp\fR] [\fBzone=\fIzone\fR]
> > -Flushes all the connection entries in the tracker used by \fIdp\fR.
> > +\*(DX\fBflush\-conntrack\fR [\fIdp\fR] [\fBzone=\fIzone\fR]
> [\fBct-tuple\fR]
>
> Shouldn't "ct-tuple" (and all the references below) be using "\fI", since
> it should be an argument?
>
Yes, I will fold in all the suggestion in dpctl.man in v2.

Thanks,

-Yi-Hung

[ovs-dev] [PATCH v2 0/3] Support conntrack flush by ct 5-tuple

2017-11-21 Thread Yi-Hung Wei
This patch series adds support for deleting a connection tracking entry
specified by a 5-tuple.

v1 -> v2:
 * Incorporate suggestions from Justin
 * Add dpif-netdev implementation
 * Improve error reporting logic in dpctl.c
 * Support ICMP 5-tuple
 * Rebase to master


Yi-Hung Wei (3):
  ct-dpif,dpif-netlink: Support conntrack flush by ct 5-tuple
  conntrack: Support conntrack flush by ct 5-tuple
  dpctl: Support flush conntrack by conntrack 5-tuple

 NEWS|   2 +
 lib/conntrack.c |  64 +++
 lib/conntrack.h |   3 ++
 lib/ct-dpif.c   | 131 +---
 lib/ct-dpif.h   |   4 +-
 lib/dpctl.c |  76 ++--
 lib/dpctl.man   |  20 ++--
 lib/dpif-netdev.c   |   6 ++-
 lib/dpif-netlink.c  |   7 ++-
 lib/dpif-provider.h |  13 +++--
 lib/netlink-conntrack.c |  97 +++
 lib/netlink-conntrack.h |   1 +
 ofproto/ofproto-dpif.c  |   2 +-
 tests/ovs-ofctl.at  |   2 +-
 tests/system-traffic.at |  64 +++
 utilities/ovs-dpctl.c   |   4 +-
 16 files changed, 459 insertions(+), 37 deletions(-)

-- 
2.7.4



Re: [ovs-dev] [PATCH 1/2] ct-dpif, dpif-netlink: Support conntrack flush by ct 5-tuple

2017-11-21 Thread Yi-Hung Wei
Thanks, Justin, for reviewing this series. I will respin v2 based on your
comments.

On Wed, Nov 15, 2017 at 4:28 PM, Justin Pettit  wrote:

> > --- a/lib/ct-dpif.c
> > +++ b/lib/ct-dpif.c
> > @@ -111,19 +111,30 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump)
> > }
> >
> > /* Flush the entries in the connection tracker used by 'dpif'.
> > - *
> > - * If 'zone' is not NULL, flush only the entries in '*zone'. */
> > + * If both 'zone' and 'tuple' are NULL, flush all the conntrack entries.
> > + * If 'zone' is not NULL, and 'tuple's is NULL, flush all the conntrack
>
> I'd drop the "s" in "'tuple's".
>
Sure.


>
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index 599308d257a8..0e1696a94e31 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -5741,10 +5741,14 @@ dpif_netdev_ct_dump_done(struct dpif *dpif
> OVS_UNUSED,
> > }
> >
> > static int
> > -dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
> > +dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
> > + const struct ct_dpif_tuple *tuple)
> > {
> > struct dp_netdev *dp = get_dp_netdev(dpif);
> >
> > +if (tuple) {
> > +return EOPNOTSUPP;
> > +}
> > return conntrack_flush(&dp->conntrack, zone);
> > }
>
> Do you plan on implementing this?
>
Yes. Will have that in v2.


> > diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
> > index 1d82a0939aa1..083d67c8d5b0 100644
> > --- a/lib/dpif-provider.h
> > +++ b/lib/dpif-provider.h
> > ...
> > -/* Flushes the connection tracking tables. If 'zone' is not NULL,
> > - * only deletes connections in '*zone'. */
> > -int (*ct_flush)(struct dpif *, const uint16_t *zone);
> > +/* Flushes the connection tracking tables.
> > + * If both 'zone' and 'tuple' are NULL, flush all the conntrack
> entries.
> > + * If 'zone' is not NULL, and 'tuple's is NULL, flush all the
> conntrack
> > + * entries in '*zone'.
>
> Same comment as above about "'tuple's'".
>
Done in v2.


>
> > diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c
> > index 1e1bb2f79d1d..9094779aaa55 100644
> > --- a/lib/netlink-conntrack.c
> > +++ b/lib/netlink-conntrack.c
> > 
> > +int
> > +nl_ct_delete(const struct ct_dpif_tuple *tuple, uint16_t zone)
> > +{
> > +int err;
> > +struct ofpbuf buf;
> > +
> > +ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
> > +nl_msg_put_nfgenmsg(&buf, 0, tuple->l3_type, NFNL_SUBSYS_CTNETLINK,
> > +IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
> > +
> > +nl_msg_put_be16(&buf, CTA_ZONE, htons(zone));
> > +if (!nl_ct_put_ct_tuple(&buf, tuple, CTA_TUPLE_ORIG)) {
> > +err = EOPNOTSUPP;
> > +goto out;
> > +}
> > +err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
> > +out:
> > +ofpbuf_uninit(&buf);
> > +return err;
> > +}
>
> The Linux code for nl_ct_flush_zone() seems to go through some contortions
> due to there being a problem with flushing a specific zone.  I took a brief
> look at the kernel code, and it looks like it does properly handle zones if
> the tuple is provided, but not otherwise.  Is that a correct reading?
>
Yes, that is correct.

On a more cosmetic note, most of the external interface refers to deleting
> a specific conntrack entry as a "flush", but the internal code refers to it
> as a "delete". I think this is a reasonable distinction, but if it's not
> carried externally, then it doesn't seem necessary internally.
>
Right, it makes sense to rename "delete" to "flush" to make it
consistent.

Also, this delete and flush code is nearly identical, so it seems like
> there's some opportunity to just call it "flush" and consolidate to a
> single implementation that handles a tuple if it's provided.
>
I am more inclined to rename nl_ct_delete() to nl_ct_flush_tuple().  I
agree that there is some code duplication between the delete and flush code,
but consolidating them into a single implementation would lead to the
following two functions. It is a bit confusing, since the first one would
either flush all entries or flush by a tuple.
- nl_ct_flush(const struct ct_dpif_tuple *tuple, uint16_t zone); // flush
all or a tuple
- nl_ct_flush_zone(uint16_t zone) // flush a particular zone

Instead, if we rename nl_ct_delete() to nl_ct_flush_tuple(), we will have
the following three functions, which seems clearer to me.
- nl_ct_flush(void) // flush all
- nl_ct_flush_zone(uint16_t zone) // flush a particular zone
- nl_ct_flush_tuple(const struct ct_dpif_tuple *tuple, uint16_t zone) //
flush a tuple
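
In other words, the resulting declarations would look roughly like this
(sketch only, summarizing the list above):

    int nl_ct_flush(void);                  /* Flush all conntrack entries. */
    int nl_ct_flush_zone(uint16_t zone);    /* Flush one zone. */
    int nl_ct_flush_tuple(const struct ct_dpif_tuple *tuple,
                          uint16_t zone);   /* Flush one entry by 5-tuple. */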


> +static bool
> > +nl_ct_put_tuple(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple)
> > +{
> > +if (!nl_ct_put_tuple_ip(buf, tuple)) {
> > +return false;
> > +}
> > +if (!nl_ct_put_tuple_proto(buf, tuple)) {
> > +return false;
> > +}
> > +return true;
> > +}
> > +
> > +static bool
> > +nl_ct_put_ct_tuple(struct ofpbuf *buf, const struct ct_dpif_tuple
> *tuple,
> > +   enum ctattr_type 

[ovs-dev] [PATCH v1] datapath-windows: Add support for deleting conntrack entry by 5-tuple.

2017-11-21 Thread Anand Kumar
To delete a conntrack entry specified by a 5-tuple, pass an additional
conntrack 5-tuple parameter to flush-conntrack.

Signed-off-by: Anand Kumar 
---
 datapath-windows/ovsext/Conntrack.c | 146 +---
 1 file changed, 134 insertions(+), 12 deletions(-)

diff --git a/datapath-windows/ovsext/Conntrack.c 
b/datapath-windows/ovsext/Conntrack.c
index 3203411..dc268b3 100644
--- a/datapath-windows/ovsext/Conntrack.c
+++ b/datapath-windows/ovsext/Conntrack.c
@@ -35,8 +35,10 @@ static PNDIS_RW_LOCK_EX ovsConntrackLockObj;
 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
 static UINT64 ctTotalEntries;
 
-static __inline NDIS_STATUS OvsCtFlush(UINT16 zone);
-
+static __inline OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple);
+static __inline NDIS_STATUS
+MapNlToCtTuple(POVS_MESSAGE msgIn, PNL_ATTR attr,
+   struct ovs_key_ct_tuple_ipv4 *ct_tuple);
 /*
  *
  * OvsInitConntrack
@@ -120,7 +122,7 @@ OvsCleanupConntrack(VOID)
 ObDereferenceObject(ctThreadCtx.threadObject);
 
 /* Force flush all entries before removing */
-OvsCtFlush(0);
+OvsCtFlush(0, NULL);
 
 if (ovsConntrackTable) {
 OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG);
@@ -1018,11 +1020,11 @@ OvsConntrackEntryCleaner(PVOID data)
 /*
  *
  * OvsCtFlush
- * Flushes out all Conntrack Entries that match the given zone
+ * Flushes out all Conntrack Entries that match any of the arguments
  *
  */
 static __inline NDIS_STATUS
-OvsCtFlush(UINT16 zone)
+OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple)
 {
 PLIST_ENTRY link, next;
 POVS_CT_ENTRY entry;
@@ -1034,9 +1036,26 @@ OvsCtFlush(UINT16 zone)
 for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) {
 LIST_FORALL_SAFE([i], link, next) {
 entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link);
-/* zone is a non-zero value */
-if (!zone || zone == entry->key.zone)
+if (tuple) {
+if (tuple->ipv4_proto != IPPROTO_ICMP &&
+tuple->ipv4_src == entry->key.src.addr.ipv4_aligned &&
+tuple->ipv4_dst == entry->key.dst.addr.ipv4_aligned &&
+tuple->ipv4_proto == entry->key.nw_proto &&
+tuple->src_port == entry->key.src.port &&
+tuple->dst_port == entry->key.dst.port &&
+(zone ? entry->key.zone == zone: TRUE)) {
+OvsCtEntryDelete(entry);
+} else if (tuple->ipv4_src == entry->key.src.addr.ipv4_aligned &&
+tuple->ipv4_dst == entry->key.dst.addr.ipv4_aligned &&
+tuple->ipv4_proto == entry->key.nw_proto &&
+tuple->src_port == entry->key.src.icmp_type &&
+tuple->dst_port == entry->key.src.icmp_code &&
+(zone ? entry->key.zone == zone: TRUE)) {
+OvsCtEntryDelete(entry);
+}
+} else if (!zone || zone == entry->key.zone) {
 OvsCtEntryDelete(entry);
+}
 }
 }
 }
@@ -1058,19 +1077,21 @@ OvsCtDeleteCmdHandler(POVS_USER_PARAMS_CONTEXT 
usrParamsCtx,
 NL_ERROR nlError = NL_ERROR_SUCCESS;
 NTSTATUS status;
 UINT16 zone = 0;
+struct ovs_key_ct_tuple_ipv4 *ct_tuple = NULL;
 NL_BUFFER nlBuf;
 UINT16 nlmsgType;
 PNL_MSG_HDR nlMsg;
 
-static const NL_POLICY ctZonePolicy[] = {
-[CTA_ZONE] = { .type = NL_A_BE16, .optional = TRUE },
+static const NL_POLICY ctAttrPolicy[] = {
+[CTA_TUPLE_ORIG] = {.type = NL_A_NESTED, .optional = TRUE},
+[CTA_ZONE] = {.type = NL_A_BE16, .optional = TRUE },
 };
 
 if ((NlAttrParse(nlMsgHdr, attrOffset, NlNfMsgAttrsLen(nlMsgHdr),
-ctZonePolicy, ARRAY_SIZE(ctZonePolicy),
+ctAttrPolicy, ARRAY_SIZE(ctAttrPolicy),
 ctAttrs, ARRAY_SIZE(ctAttrs)))
 != TRUE) {
-OVS_LOG_ERROR("Zone attr parsing failed for msg: %p", nlMsgHdr);
+OVS_LOG_ERROR("Ct attr parsing failed for msg: %p", nlMsgHdr);
 status = STATUS_INVALID_PARAMETER;
 goto done;
 }
@@ -1079,7 +1100,21 @@ OvsCtDeleteCmdHandler(POVS_USER_PARAMS_CONTEXT 
usrParamsCtx,
 zone = NlAttrGetU16(ctAttrs[CTA_ZONE]);
 }
 
-status = OvsCtFlush(zone);
+if (ctAttrs[CTA_TUPLE_ORIG]) {
+ct_tuple = OvsAllocateMemoryWithTag(sizeof(struct ovs_key_ct_tuple_ipv4),
+OVS_CT_POOL_TAG);
+if (ct_tuple == NULL) {
+status = STATUS_INSUFFICIENT_RESOURCES;
+goto done;
+}
+/* 

[ovs-dev] [PATCH v1] Add support to delete a conntrack entry by 5-tuple

2017-11-21 Thread Anand Kumar
This patch adds support for deleting a conntrack entry by conntrack 5-tuple
in the Windows kernel.

The related userspace changes are currently being reviewed:
https://patchwork.ozlabs.org/project/openvswitch/list/?series=13345

The Windows kernel changes can go in independently of the userspace changes,
since the conntrack 5-tuple is an optional parameter.

Anand Kumar (1):
  datapath-windows: Add support for deleting conntrack entry by 5 tuple.

 datapath-windows/ovsext/Conntrack.c | 146 +---
 1 file changed, 134 insertions(+), 12 deletions(-)

-- 
2.9.3.windows.1



Re: [ovs-dev] [PATCH v2 0/5] dpif-netdev: Cuckoo-Distributor implementation

2017-11-21 Thread Wang, Yipeng1
Hi, Jan,

Thanks for sharing the patch, and it was nice talking with you at the
conference. Could you tell us more about the test case you used, such as the
rule set and traffic pattern? Since we have some other work at hand that needs
to be done soon, we will test the patch later and get back to you in a week or
two.

Thanks,
Yipeng

> -Original Message-
> From: Jan Scheurich [mailto:jan.scheur...@ericsson.com]
> Sent: Monday, November 20, 2017 9:39 AM
> To: Wang, Yipeng1 ; d...@openvswitch.org
> Cc: Gobriel, Sameh ; Fischetti, Antonio
> ; db...@vmware.com; Ben Pfaff (b...@ovn.org)
> 
> Subject: RE: [PATCH v2 0/5] dpif-netdev: Cuckoo-Distributor implementation
> 
> Hi Yipeng and Sameh,
> 
> As discussed on the OVS conference, I have just sent my Datapath Flow
> Cache patch to the ML:
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-
> November/341066.html
> http://patchwork.ozlabs.org/patch/839645/
> 
> I would be happy if you could test it in your setup and compare the
> performance with your CD implementation. I don't have a test bed at hand
> where I could easily test the speedup for many visited subtables.
> 
> Thanks, Jan
> 
> > -Original Message-
> > From: Yipeng Wang [mailto:yipeng1.w...@intel.com]
> > Sent: Wednesday, 01 November, 2017 00:40
> > To: d...@openvswitch.org
> > Cc: yipeng1.w...@intel.com; sameh.gobr...@intel.com;
> antonio.fische...@intel.com; db...@vmware.com; Jan Scheurich
> > 
> > Subject: [PATCH v2 0/5] dpif-netdev: Cuckoo-Distributor implementation
> >
> > The Datapath Classifier uses tuple space search for flow classification.
> > The rules are arranged into a set of tuples/subtables (each with a
> > distinct mask).  Each subtable is implemented as a hash table and lookup
> > is done with flow keys formed by selecting the bits from the packet header
> > based on each subtable's mask. Tuple space search will sequentially search
> > each subtable until a match is found. With a large number of subtables, a
> > sequential search of the subtables could consume a lot of CPU cycles. In
> > a testbench with a uniform traffic pattern equally distributed across 20
> > subtables, we measured that up to 65% of total execution time is
> attributed
> > to the megaflow cache lookup.
> >
> > This patch presents the idea of the two-layer hierarchical lookup, where a
> > low overhead first level of indirection is accessed first, we call this
> > level cuckoo distributor (CD). If a flow key has been inserted in the flow
> > table, the first level will indicate with high probability which
> > subtable to look into. A lookup is performed on the second level (the
> > target subtable) to retrieve the result. If the key doesn’t have a match,
> > then we revert back to the sequential search of subtables. The patch is
> > partially inspired by earlier concepts proposed in "simTable"[1] and
> > "Cuckoo Filter"[2], and DPDK's Cuckoo Hash implementation.
> >
> > This patch can improve the already existing Subtable Ranking when traffic
> > data has high entropy. Subtable Ranking helps minimize the number of
> > traversed subtables when most of the traffic hit the same subtable.
> > However, in the case of high entropy traffic such as traffic coming from
> > a physical port, multiple subtables could be hit with a similar frequency.
> > In this case the average subtable lookups per hit would be much greater
> > than 1. In addition, CD can adaptively turn off when it finds the traffic
> > mostly hit one subtable. Thus, CD will not be an overhead when Subtable
> > Ranking works well.
> >
> > Scheme:
> > CD is in front of the subtables. Packets are directed to corresponding
> subtable
> > if hit in CD instead of searching each subtable sequentially.
> >  ---
> > |  CD   |
> >  ---
> >\
> > \
> >  -  - -
> > |sub  ||sub  |...|sub  |
> > |table||table|   |table|
> >  -  - -
> >
> >  Evaluation:
> >  --
> > We create a set of rules with various src IP. We feed traffic containing
> various
> > numbers of flows with various src IP and dst IP. All the flows hit 10/20/30
> > rules creating 10/20/30 subtables. We will explain the rule/traffic setup
> > in detail later.
> >
> > The table below shows the preliminary continuous testing results (full line
> > speed test) we collected with a uni-directional phy-to-phy setup. OvS
> > runs with 1 PMD. We use Spirent as the hardware traffic generator.
> >
> >  Before v2 rebase:
> >  
> > AVX2 data:
> > 20k flows:
> > no.subtable: 10  20  30
> > cd-ovs   4267332 3478251 3126763
> > orig-ovs 3260883 2174551 1689981
> > speedup  1.31x   1.60x   1.85x
> >
> > 100k flows:
> > no.subtable: 10  20  30
> > cd-ovs   4015783 3276100 2970645
> > orig-ovs 2692882 1711955 1302321
> > speedup  
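
To summarize the two-level lookup described in the quoted cover letter, a
minimal sketch follows; the names here are illustrative only, not the patch's
actual API:

    struct dpcls_subtable *
    cd_assisted_lookup(struct cd *cd, struct dpcls *cls,
                       const struct netdev_flow_key *key)
    {
        /* First level: small, low-overhead hash (the cuckoo distributor). */
        int idx = cd_lookup(cd, key);
        if (idx >= 0) {
            /* Second level: probe only the indicated subtable. */
            struct dpcls_subtable *sub = cls->subtables[idx];
            if (subtable_lookup(sub, key)) {
                return sub;
            }
        }
        /* CD miss or false positive: fall back to the sequential search. */
        return sequential_subtable_search(cls, key);
    }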

[ovs-dev] Treasury Management

2017-11-21 Thread Máxima rentabilidad y riesgos mínimos
Balanced liquidity to meet financial challenges

Treasury Management: Its importance in a financial crisis
November 24 - C.P. and MAN. Fernando García Zárate, 9am-6pm

You will obtain the tools needed to face the challenges of a financial crisis,
through optimization of the economic cycle, maximization of operating and
investment returns, and cost reduction. Identify and prevent liquidity risks
by learning the instruments available to hedge them.

BENEFITS OF ATTENDING:

-You will understand the treasurer's role in the company and their
relationship with the CFO
-You will learn the tools to optimize cash flow
-You will learn the importance of the treasury budget and liquidity
-You will identify the risks of poor treasury management

Need the information right away? Reply to this email with the word:
Tesorería + name - phone - email.


call center: 018002120744


 




Re: [ovs-dev] [PATCH v2 0/9] convert Perl to Python

2017-11-21 Thread Aaron Conole
Ben Pfaff  writes:

> This series gets rid of all usage of Perl in the tree, converting
> it to Python instead.
>
> v1->v2:
>   * Minor fixes from Aaron Conole.
>   * Break soexpand and sodepends common code into library (suggested by
> Aaron Conole)   
>
> Ben Pfaff (9):
>   tests: Convert uuidfilt utility from Perl to Python.
>   tests: Convert ovsdb-monitor-sort utility from Perl to Python.
>   tests: Convert flowgen utility from Perl to Python.
>   tests: Convert dpdkstrip utility from Perl to Python.
>   tests: Convert soexpand build tool from Perl to Python.
>   tests: Convert sodepends build tool from Perl to Python.
>   tests: Convert dot2pic build tool from Perl to Python.
>   tests: Convert miscellaneous test code from Perl to Python.
>   Remove Perl dependency.

For the series:

Acked-by: Aaron Conole 


>  Documentation/intro/install/general.rst |   8 -
>  Makefile.am |  16 +-
>  build-aux/automake.mk   |   5 +-
>  build-aux/dpdkstrip.pl  |  35 -
>  build-aux/dpdkstrip.py  |  48 ++
>  build-aux/sodepends.pl  |  70 -
>  build-aux/sodepends.py  |  68 +
>  build-aux/soexpand.pl   |  40 -
>  build-aux/soexpand.py   |  59 
>  configure.ac|   6 -
>  ovn/automake.mk |   4 +-
>  ovsdb/dot2pic   | 155 +++
>  python/automake.mk  |   3 +-
>  python/build/soutil.py  |  56 +++
>  rhel/openvswitch-fedora.spec.in |   2 +-
>  tests/atlocal.in|   9 +-
>  tests/automake.mk   |   7 +-
>  tests/flowgen.pl| 253 
> 
>  tests/flowgen.py| 240 ++
>  tests/library.at|   2 +-
>  tests/ofproto-macros.at |   4 +-
>  tests/ofproto.at|  62 
>  tests/ovn-controller.at |   2 +-
>  tests/ovn-nbctl.at  | 102 ++---
>  tests/ovs-macros.at |   5 +-
>  tests/ovs-vsctl.at  |  16 +-
>  tests/ovsdb-execution.at|   4 +-
>  tests/ovsdb-idl.at  |  30 ++--
>  tests/ovsdb-monitor-sort.pl |  52 ---
>  tests/ovsdb-monitor-sort.py |  85 +++
>  tests/ovsdb-monitor.at  |   4 +-
>  tests/ovsdb-rbac.at |  28 ++--
>  tests/ovsdb-server.at   |  36 ++---
>  tests/ovsdb-tool.at |  18 +--
>  tests/ovsdb-trigger.at  |   2 +-
>  tests/system-kmod-macros.at |   4 +-
>  tests/system-userspace-macros.at|   4 +-
>  tests/uuidfilt.pl   |  33 -
>  tests/uuidfilt.py   |  50 +++
>  vswitchd/automake.mk|   2 +-
>  vtep/automake.mk|   2 +-
>  41 files changed, 891 insertions(+), 740 deletions(-)
>  delete mode 100644 build-aux/dpdkstrip.pl
>  create mode 100755 build-aux/dpdkstrip.py
>  delete mode 100644 build-aux/sodepends.pl
>  create mode 100755 build-aux/sodepends.py
>  delete mode 100644 build-aux/soexpand.pl
>  create mode 100755 build-aux/soexpand.py
>  create mode 100755 python/build/soutil.py
>  delete mode 100755 tests/flowgen.pl
>  create mode 100755 tests/flowgen.py
>  delete mode 100755 tests/ovsdb-monitor-sort.pl
>  create mode 100755 tests/ovsdb-monitor-sort.py
>  delete mode 100755 tests/uuidfilt.pl
>  create mode 100755 tests/uuidfilt.py


Re: [ovs-dev] [PATCH v2 1/9] tests: Convert uuidfilt utility from Perl to Python.

2017-11-21 Thread Aaron Conole
Ben Pfaff  writes:

> Perl is unfashionable and Python is more widely available and understood,
> so this commit converts one of the OVS uses of Perl into Python.
>
> Signed-off-by: Ben Pfaff 
> ---
>  tests/automake.mk|   5 +-
>  tests/ofproto-macros.at  |   4 +-
>  tests/ofproto.at |  24 -
>  tests/ovn-controller.at  |   2 +-
>  tests/ovn-nbctl.at   | 102 
> +++
>  tests/ovs-macros.at  |   4 ++
>  tests/ovs-vsctl.at   |  16 +++---
>  tests/ovsdb-execution.at |   4 +-
>  tests/ovsdb-idl.at   |  30 ++--
>  tests/ovsdb-monitor.at   |   4 +-
>  tests/ovsdb-rbac.at  |  28 +--
>  tests/ovsdb-server.at|  36 +++---
>  tests/ovsdb-tool.at  |  18 +++
>  tests/ovsdb-trigger.at   |   2 +-
>  tests/system-kmod-macros.at  |   4 +-
>  tests/system-userspace-macros.at |   4 +-
>  tests/uuidfilt.pl|  33 -
>  tests/uuidfilt.py|  50 +++
>  18 files changed, 196 insertions(+), 174 deletions(-)
>  delete mode 100755 tests/uuidfilt.pl
>  create mode 100755 tests/uuidfilt.py
>
> diff --git a/tests/automake.mk b/tests/automake.mk
> index 156b40f58d7c..3ca60e2ea450 100644
> --- a/tests/automake.mk
> +++ b/tests/automake.mk
> @@ -290,7 +290,7 @@ $(srcdir)/package.m4: $(top_srcdir)/configure.ac
>  noinst_PROGRAMS += tests/test-ovsdb
>  tests_test_ovsdb_SOURCES = tests/test-ovsdb.c
>  nodist_tests_test_ovsdb_SOURCES = tests/idltest.c tests/idltest.h
> -EXTRA_DIST += tests/uuidfilt.pl tests/ovsdb-monitor-sort.pl
> +EXTRA_DIST += tests/ovsdb-monitor-sort.pl
>  tests_test_ovsdb_LDADD = ovsdb/libovsdb.la lib/libopenvswitch.la
>  
>  noinst_PROGRAMS += tests/test-lib
> @@ -389,7 +389,8 @@ CHECK_PYFILES = \
>   tests/MockXenAPI.py \
>   tests/test-unix-socket.py \
>   tests/test-unixctl.py \
> - tests/test-vlog.py
> + tests/test-vlog.py \
> + tests/uuidfilt.py
>  EXTRA_DIST += $(CHECK_PYFILES)
>  PYCOV_CLEAN_FILES += $(CHECK_PYFILES:.py=.py,cover) .coverage
>  
> diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at
> index 38449db32ee0..4fbb10070342 100644
> --- a/tests/ofproto-macros.at
> +++ b/tests/ofproto-macros.at
> @@ -353,7 +353,7 @@ m4_define([_OVS_VSWITCHD_START],
>  # br0 with predictable settings, passing 'vsctl-args' as additional
>  # commands to ovs-vsctl.  If 'vsctl-args' causes ovs-vsctl to provide
>  # output (e.g. because it includes "create" commands) then 'vsctl-output'
> -# specifies the expected output after filtering through uuidfilt.pl.
> +# specifies the expected output after filtering through uuidfilt.
>  #
>  # If a test needs to use "system" devices (as dummies), then specify
>  # =override (literally) as the third argument.  Otherwise, system devices
> @@ -364,7 +364,7 @@ m4_define([_OVS_VSWITCHD_START],
>  # to ovs-vswitchd
>  m4_define([OVS_VSWITCHD_START],
>[_OVS_VSWITCHD_START([--enable-dummy$3 --disable-system $4])
> -   AT_CHECK([add_of_br 0 $1 m4_if([$2], [], [], [| ${PERL} 
> $srcdir/uuidfilt.pl])], [0], [$2])
> +   AT_CHECK([add_of_br 0 $1 m4_if([$2], [], [], [| uuidfilt])], [0], [$2])
>  ])
>  
>  # check_logs scans through all *.log files (except '*.log' and testsuite.log)
> diff --git a/tests/ofproto.at b/tests/ofproto.at
> index 31cb5208fc1c..9853a21db989 100644
> --- a/tests/ofproto.at
> +++ b/tests/ofproto.at
> @@ -2196,7 +2196,7 @@ AT_CHECK(
>   -- --id=@t0 create Flow_Table name=main \
>   -- --id=@t1 create Flow_Table flow-limit=1024 \
>   -- set bridge br0 'flow_tables={1=@t1,0=@t0}' \
> -   | ${PERL} $srcdir/uuidfilt.pl],
> +   | uuidfilt],
>[0], [<0>
>  <1>
>  ])
> @@ -2348,7 +2348,7 @@ AT_CHECK(
>   -- --id=@t0 create Flow_Table name=main \
>   -- --id=@t1 create Flow_Table flow-limit=1024 \
>   -- set bridge br0 'flow_tables={1=@t1,0=@t0}' \
> -   | ${PERL} $srcdir/uuidfilt.pl],
> +   | uuidfilt],
>[0], [<0>
>  <1>
>  ])
> @@ -2603,7 +2603,7 @@ AT_CHECK(
>   -- --id=@t0 create Flow_Table name=main \
>   -- --id=@t1 create Flow_Table flow-limit=1024 \
>   -- set bridge br0 'flow_tables={1=@t1,0=@t0}' \
> -   | ${PERL} $srcdir/uuidfilt.pl],
> +   | uuidfilt],
>[0], [<0>
>  <1>
>  ])
> @@ -2655,7 +2655,7 @@ AT_CHECK(
>[ovs-vsctl \
>   -- --id=@t0 create Flow_Table flow-limit=4 \
>   -- set bridge br0 flow_tables:0=@t0 \
> -   | ${PERL} $srcdir/uuidfilt.pl],
> +   | uuidfilt],
>[0], [<0>
>  ])
>  # Add 4 flows.
> @@ -2699,7 +2699,7 @@ AT_CHECK(
>[ovs-vsctl \
>   -- --id=@t0 create Flow_Table flow-limit=4 \
>   -- set bridge br0 flow_tables:0=@t0 \
> -   | ${PERL} $srcdir/uuidfilt.pl],
> +   | uuidfilt],
>[0], [<0>
>  ])
>  # Add 4 flows.
> @@ -2738,7 +2738,7 @@ AT_CHECK(
>[ovs-vsctl \
>   -- --id=@t0 create Flow_Table flow-limit=4 overflow-policy=evict 

[ovs-dev] [RFC PATCH v3 8/8] netdev-dpdk: support multi-segment jumbo frames

2017-11-21 Thread Mark Kavanagh
Currently, jumbo frame support for OvS-DPDK is implemented by
increasing the size of mbufs within a mempool, such that each mbuf
within the pool is large enough to contain an entire jumbo frame of
a user-defined size. Typically, for each user-defined MTU,
'requested_mtu', a new mempool is created, containing mbufs of size
~requested_mtu.

With the multi-segment approach, a port uses a single mempool,
(containing standard/default-sized mbufs of ~2k bytes), irrespective
of the user-requested MTU value. To accommodate jumbo frames, mbufs
are chained together, where each mbuf in the chain stores a portion of
the jumbo frame. Each mbuf in the chain is termed a segment, hence the
name.

== Enabling multi-segment mbufs ==
Multi-segment and single-segment mbufs are mutually exclusive, and the
user must decide on which approach to adopt on init. The introduction
of a new OVSDB field, 'dpdk-multi-seg-mbufs', facilitates this. This
is a global boolean value, which determines how jumbo frames are
represented across all DPDK ports. In the absence of a user-supplied
value, 'dpdk-multi-seg-mbufs' defaults to false, i.e. multi-segment
mbufs must be explicitly enabled / single-segment mbufs remain the
default.

Setting the field is identical to setting existing DPDK-specific OVSDB
fields:

ovs-vsctl set Open_vSwitch . other_config:dpdk-init=true
ovs-vsctl set Open_vSwitch . other_config:dpdk-lcore-mask=0x10
ovs-vsctl set Open_vSwitch . other_config:dpdk-socket-mem=4096,0
==> ovs-vsctl set Open_vSwitch . other_config:dpdk-multi-seg-mbufs=true

Signed-off-by: Mark Kavanagh 
---
 NEWS |  1 +
 lib/dpdk.c   |  7 +++
 lib/netdev-dpdk.c| 43 ---
 lib/netdev-dpdk.h|  1 +
 vswitchd/vswitch.xml | 20 
 5 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/NEWS b/NEWS
index c15dc24..657b598 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,7 @@ Post-v2.8.0
- DPDK:
  * Add support for DPDK v17.11
  * Add support for vHost IOMMU feature
+ * Add support for multi-segment mbufs
 
 v2.8.0 - 31 Aug 2017
 
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 8da6c32..4c28bd0 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -450,6 +450,13 @@ dpdk_init__(const struct smap *ovs_other_config)
 
 /* Finally, register the dpdk classes */
 netdev_dpdk_register();
+
+bool multi_seg_mbufs_enable = smap_get_bool(ovs_other_config,
+"dpdk-multi-seg-mbufs", false);
+if (multi_seg_mbufs_enable) {
+VLOG_INFO("DPDK multi-segment mbufs enabled\n");
+netdev_dpdk_multi_segment_mbufs_enable();
+}
 }
 
 void
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 36275bd..293edad 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -65,6 +65,7 @@ enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
 
 VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+static bool dpdk_multi_segment_mbufs = false;
 
 #define DPDK_PORT_WATCHDOG_INTERVAL 5
 
@@ -500,6 +501,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, uint16_t frame_len)
   + dev->requested_n_txq * dev->requested_txq_size
   + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
   + MIN_NB_MBUF;
+/* XXX (RFC) - should n_mbufs be increased if multi-seg mbufs are used? */
 
 ovs_mutex_lock(_mp_mutex);
 do {
@@ -568,7 +570,13 @@ dpdk_mp_free(struct rte_mempool *mp)
 
 /* Tries to allocate a new mempool - or re-use an existing one where
  * appropriate - on requested_socket_id with a size determined by
- * requested_mtu and requested Rx/Tx queues.
+ * requested_mtu and requested Rx/Tx queues. Some properties of the mempool's
+ * elements are dependent on the value of 'dpdk_multi_segment_mbufs':
+ * - if 'true', then the mempool contains standard-sized mbufs that are chained
+ *   together to accommodate packets of size 'requested_mtu'.
+ * - if 'false', then the members of the allocated mempool are
+ *   non-standard-sized mbufs. Each mbuf in the mempool is large enough to
+ *   fully accommodate packets of size 'requested_mtu'.
  * On success - or when re-using an existing mempool - the new configuration
  * will be applied.
  * On error, device will be left unchanged. */
@@ -576,10 +584,18 @@ static int
 netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
 OVS_REQUIRES(dev->mutex)
 {
-uint16_t buf_size = dpdk_buf_size(dev->requested_mtu);
+uint16_t buf_size = 0;
 struct rte_mempool *mp;
 int ret = 0;
 
+/* Contiguous mbufs in use - permit oversized mbufs */
+if (!dpdk_multi_segment_mbufs) {
+buf_size = dpdk_buf_size(dev->requested_mtu);
+} else {
+/* multi-segment mbufs - use standard mbuf size */
+buf_size = dpdk_buf_size(ETHER_MTU);
+}
+
 mp = dpdk_mp_create(dev, buf_size);
 if (!mp) {
 VLOG_ERR("Failed to create memory pool for 

[ovs-dev] [RFC PATCH v3 7/8] netdev-dpdk: copy large packet to multi-seg. mbufs

2017-11-21 Thread Mark Kavanagh
From: Michael Qiu 

Currently, packets are only copied to a single segment in
the function dpdk_do_tx_copy(). This could be an issue in
the case of jumbo frames, particularly when multi-segment
mbufs are involved.

This patch calculates the number of segments needed by a
packet and copies the data to each segment.
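
In other words, a packet of 'size' bytes is split across
ceil(size / max_data_len) mbufs; roughly, as computed in the hunk below:

    uint16_t max_data_len = temp->buf_len - temp->data_off;  /* room per mbuf */
    uint32_t nb_segs = size / max_data_len + (size % max_data_len ? 1 : 0);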

Signed-off-by: Michael Qiu 
Signed-off-by: Mark Kavanagh 
---
 lib/netdev-dpdk.c | 55 +++
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 61a0dca..36275bd 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1824,8 +1824,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct 
dp_packet_batch *batch)
 #endif
 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
+struct rte_mbuf *temp, *head = NULL;
 uint32_t cnt = batch_cnt;
 uint32_t dropped = 0;
+uint32_t i, j, nb_segs;
 
 if (dev->type != DPDK_DEV_VHOST) {
 /* Check if QoS has been configured for this netdev. */
@@ -1838,9 +1840,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct 
dp_packet_batch *batch)
 
 uint32_t txcnt = 0;
 
-for (uint32_t i = 0; i < cnt; i++) {
+for (i = 0; i < cnt; i++) {
 struct dp_packet *packet = batch->packets[i];
 uint32_t size = dp_packet_size(packet);
+uint16_t max_data_len, data_len;
 
 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
 VLOG_WARN_RL(, "Too big size %u max_packet_len %d",
@@ -1850,15 +1853,59 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct 
dp_packet_batch *batch)
 continue;
 }
 
-pkts[txcnt] = rte_pktmbuf_alloc(dev->mp);
+temp = pkts[txcnt] = rte_pktmbuf_alloc(dev->mp);
 if (OVS_UNLIKELY(!pkts[txcnt])) {
 dropped += cnt - i;
 break;
 }
 
+/* All new allocated mbuf's max data len is the same */
+max_data_len = temp->buf_len - temp->data_off;
+
+/* Calculate # of output mbufs. */
+nb_segs = size / max_data_len;
+if (size % max_data_len)
+nb_segs = nb_segs + 1;
+
+/* Allocate additional mbufs when multiple output mbufs required. */
+for (j = 1; j < nb_segs; j++) {
+temp->next = rte_pktmbuf_alloc(dev->mp);
+if (!temp->next) {
+rte_pktmbuf_free(pkts[txcnt]);
+pkts[txcnt] = NULL;
+break;
+}
+temp = temp->next;
+}
 /* We have to do a copy for now */
-memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
-   dp_packet_data(packet), size);
+rte_pktmbuf_pkt_len(pkts[txcnt]) = size;
+temp = pkts[txcnt];
+
+data_len = size < max_data_len ? size: max_data_len;
+if (packet->source == DPBUF_DPDK) {
+head = &(packet->mbuf);
+while (temp && head && size > 0) {
+rte_memcpy(rte_pktmbuf_mtod(temp, void*),
+dp_packet_data((struct dp_packet *)head), data_len);
+rte_pktmbuf_data_len(temp) = data_len;
+head = head->next;
+size = size - data_len;
+data_len =  size < max_data_len ? size: max_data_len;
+temp = temp->next;
+}
+} else {
+int offset = 0;
+while (temp && size > 0) {
+memcpy(rte_pktmbuf_mtod(temp, void *),
+dp_packet_at(packet, offset, data_len), data_len);
+rte_pktmbuf_data_len(temp) = data_len;
+temp = temp->next;
+size = size - data_len;
+offset += data_len;
+data_len = size < max_data_len ? size: max_data_len;
+}
+}
+
 dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
 pkts[txcnt]->nb_segs = packet->mbuf.nb_segs;
 pkts[txcnt]->ol_flags = packet->mbuf.ol_flags;
-- 
1.9.3



[ovs-dev] [RFC PATCH v3 6/8] lib/dp-packet: copy data from multi-seg. DPDK mbuf

2017-11-21 Thread Mark Kavanagh
From: Michael Qiu 

When cloning a packet whose source is the DPDK driver,
multi-segment mbufs must be considered, and each segment's
data must be copied one by one.

Signed-off-by: Michael Qiu 
Signed-off-by: Mark Kavanagh 
---
 lib/dp-packet.c | 24 ++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index 5c590e5..26fff02 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -166,10 +166,30 @@ struct dp_packet *
 dp_packet_clone_with_headroom(const struct dp_packet *buffer, size_t headroom)
 {
 struct dp_packet *new_buffer;
+uint32_t pkt_len = dp_packet_size(buffer);
 
+#ifdef DPDK_NETDEV
+/* copy multi-seg data */
+if (buffer->source == DPBUF_DPDK && buffer->mbuf.nb_segs > 1) {
+uint32_t offset = 0;
+void *dst = NULL;
+struct rte_mbuf *tmbuf = CONST_CAST(struct rte_mbuf *, &(buffer->mbuf));
+
+new_buffer = dp_packet_new_with_headroom(pkt_len, headroom);
+dp_packet_set_size(new_buffer, pkt_len + headroom);
+dst = dp_packet_tail(new_buffer);
+
+while (tmbuf) {
+rte_memcpy((char *)dst + offset,
+   rte_pktmbuf_mtod(tmbuf, void *), tmbuf->data_len);
+offset += tmbuf->data_len;
+tmbuf = tmbuf->next;
+}
+}
+else
+#endif
 new_buffer = dp_packet_clone_data_with_headroom(dp_packet_data(buffer),
- dp_packet_size(buffer),
- headroom);
+ pkt_len, headroom);
 /* Copy the following fields into the returned buffer: l2_pad_size,
  * l2_5_ofs, l3_ofs, l4_ofs, cutlen, packet_type and md. */
 memcpy(_buffer->l2_pad_size, >l2_pad_size,
-- 
1.9.3



[ovs-dev] [RFC PATCH v3 5/8] lib/dp-packet: fix dp_packet_put_uninit for multi-seg mbufs

2017-11-21 Thread Mark Kavanagh
dp_packet_put_uninit(dp_packet, size) appends 'size' bytes to the tail
of a dp_packet. In the case of multi-segment mbufs, it is the data
length of the last mbuf in the mbuf chain that should be adjusted by
'size' bytes.

In its current implementation, dp_packet_put_uninit() adjusts the
dp_packet's size via a call to dp_packet_set_size(); however, this
adjusts the data length of the first mbuf in the chain, which is
incorrect in the case of multi-segment mbufs. Instead, traverse the
mbuf chain to locate the final mbuf of said chain, and update its
data_len [1]. To finish, increase the packet length of the entire
mbuf [2] by 'size'.

[1] In the case of a single-segment mbuf, this is the mbuf itself.
[2] This is stored in the first mbuf of an mbuf chain.
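
Sketched with the names used in the hunk below, the tail adjustment amounts to:

    struct rte_mbuf *buf = &b->mbuf;
    while (buf->next) {
        buf = buf->next;          /* Walk to the last segment in the chain. */
    }
    buf->data_len += size;        /* Grow only the last segment... */
    b->mbuf.pkt_len += size;      /* ...and the chain's total packet length. */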

Signed-off-by: Mark Kavanagh 
---
 lib/dp-packet.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index 5078211..5c590e5 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -325,6 +325,22 @@ dp_packet_put_uninit(struct dp_packet *b, size_t size)
 void *p;
 dp_packet_prealloc_tailroom(b, size);
 p = dp_packet_tail(b);
+#ifdef DPDK_NETDEV
+if (b->source == DPBUF_DPDK)  {
+struct rte_mbuf *buf = &(b->mbuf);
+/* In the case of multi-segment mbufs, the data length of the last mbuf
+ * should be adjusted by 'size' bytes. A call to dp_packet_size() would
+ * adjust the data length of the first mbuf in the segment, so we avoid
+ * invoking same; as a result, the packet length of the entire mbuf
+ * chain (stored in the first mbuf of said chain) must be adjusted here
+ * instead.
+ */
+while (buf->next)
+buf = buf->next;
+buf->data_len += size;
+b->mbuf.pkt_len += size;
+} else
+#endif
 dp_packet_set_size(b, dp_packet_size(b) + size);
 return p;
 }
-- 
1.9.3



[ovs-dev] [RFC PATCH v3 4/8] lib/dp-packet: Fix data_len issue with multi-segs

2017-11-21 Thread Mark Kavanagh
From: Michael Qiu 

When a packet comes from a DPDK source and contains
multiple segments, data_len is not equal to the
packet size. This patch fixes this issue.

Co-authored-by: Mark Kavanagh 
Co-authored-by: Przemyslaw Lal 
Co-authored-by: Marcin Ksiadz 
Co-authored-by: Yuanhan Liu 

Signed-off-by: Michael Qiu 
Signed-off-by: Mark Kavanagh 
Signed-off-by: Przemyslaw Lal 
Signed-off-by: Marcin Ksiadz 
Signed-off-by: Yuanhan Liu 
---
 lib/dp-packet.h | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 7aa440f..c2736d3 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -23,6 +23,7 @@
 #ifdef DPDK_NETDEV
 #include 
 #include 
+#include "rte_ether.h"
 #endif
 
 #include "netdev-dpdk.h"
@@ -429,17 +430,14 @@ dp_packet_size(const struct dp_packet *b)
 static inline void
 dp_packet_set_size(struct dp_packet *b, uint32_t v)
 {
-/* netdev-dpdk does not currently support segmentation; consequently, for
- * all intents and purposes, 'data_len' (16 bit) and 'pkt_len' (32 bit) may
- * be used interchangably.
- *
- * On the datapath, it is expected that the size of packets
- * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
- * loss of accuracy in assigning 'v' to 'data_len'.
+/*
+ * Assign current segment length. If total length is greater than
+ * max data length in a segment, additional calculation is needed
  */
-b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
-b->mbuf.pkt_len = v; /* Total length of all segments linked to
-  * this segment. */
+b->mbuf.data_len = MIN(v, b->mbuf.buf_len - b->mbuf.data_off);
+
+/* Total length of all segments linked to this segment. */
+b->mbuf.pkt_len = v;
 }
 
 static inline uint16_t
-- 
1.9.3



[ovs-dev] [RFC PATCH v3 3/8] lib/dp-packet: copy mbuf info for packet copy

2017-11-21 Thread Mark Kavanagh
From: Michael Qiu 

Currently, when copying a packet, much of the DPDK mbuf's information
is lost, such as packet type, ol_flags, etc.
That information is very important for DPDK
packet processing.

Signed-off-by: Michael Qiu 
[mark.b.kavan...@intel.com rebased]
Signed-off-by: Mark Kavanagh 
---
 lib/dp-packet.c   | 3 +++
 lib/netdev-dpdk.c | 4 
 2 files changed, 7 insertions(+)

diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index 443c225..5078211 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -178,6 +178,9 @@ dp_packet_clone_with_headroom(const struct dp_packet 
*buffer, size_t headroom)
 
 #ifdef DPDK_NETDEV
 new_buffer->mbuf.ol_flags = buffer->mbuf.ol_flags;
+new_buffer->mbuf.tx_offload = buffer->mbuf.tx_offload;
+new_buffer->mbuf.packet_type = buffer->mbuf.packet_type;
+new_buffer->mbuf.nb_segs = buffer->mbuf.nb_segs;
 #else
 new_buffer->rss_hash_valid = buffer->rss_hash_valid;
 #endif
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index c5eb851..61a0dca 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -1860,6 +1860,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct 
dp_packet_batch *batch)
 memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
dp_packet_data(packet), size);
 dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
+pkts[txcnt]->nb_segs = packet->mbuf.nb_segs;
+pkts[txcnt]->ol_flags = packet->mbuf.ol_flags;
+pkts[txcnt]->packet_type = packet->mbuf.packet_type;
+pkts[txcnt]->tx_offload = packet->mbuf.tx_offload;
 
 txcnt++;
 }
-- 
1.9.3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC PATCH v3 0/8] netdev-dpdk: support multi-segment mbufs

2017-11-21 Thread Mark Kavanagh
Overview

This patchset introduces support for multi-segment mbufs to OvS-DPDK.
Multi-segment mbufs are typically used when the size of an mbuf is
insufficient to contain the entirety of a packet's data. Instead, the
data is split across numerous mbufs, each carrying a portion, or
'segment', of the packet data. mbufs are chained via their 'next'
attribute (an mbuf pointer).

Use Cases
=
i.  Handling oversized (guest-originated) frames, which are marked
for hardware acceleration/offload (TSO, for example).

Packets which originate from a non-DPDK source may be marked for
offload; as such, they may be larger than the permitted ingress
interface's MTU, and may be stored in an oversized dp-packet. In
order to transmit such packets over a DPDK port, their contents
must be copied to a DPDK mbuf (via dpdk_do_tx_copy). However, in
its current implementation, that function only copies data into
a single mbuf; if the space available in the mbuf is exhausted
before all of the packet data has been copied, the remainder is lost.
Similarly, when cloning a DPDK mbuf, it must be considered
whether that mbuf contains multiple segments. Both issues are
resolved within this patchset.
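As an illustration of the copy problem described above, here is a sketch
only (not the dpdk_do_tx_copy() change from this series); the 'seg' struct
and SEG_CAPACITY are hypothetical stand-ins for rte_mbuf and its data room:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define SEG_CAPACITY 2048   /* Hypothetical per-segment data room. */

struct seg {
    uint8_t data[SEG_CAPACITY];
    uint16_t data_len;
    struct seg *next;
};

/* Copy 'len' bytes from 'src' into a chain of segments, allocating as many
 * segments as needed rather than silently dropping whatever does not fit
 * into the first one. */
static struct seg *
copy_to_chain(const uint8_t *src, size_t len)
{
    struct seg *head = NULL;
    struct seg **tail = &head;

    while (len > 0) {
        struct seg *s = calloc(1, sizeof *s);
        if (!s) {
            return NULL;   /* Freeing the partial chain is omitted here. */
        }
        s->data_len = len > SEG_CAPACITY ? SEG_CAPACITY : (uint16_t) len;
        memcpy(s->data, src, s->data_len);
        src += s->data_len;
        len -= s->data_len;
        *tail = s;
        tail = &s->next;
    }
    return head;
}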

ii. Handling jumbo frames.

While OvS already supports jumbo frames, it does so by increasing
mbuf size, such that the entirety of a jumbo frame may be handled
in a single mbuf. This is certainly the preferred, and most
performant approach (and remains the default). However, it places
high demands on system memory; multi-segment mbufs may be
preferable for systems which are memory-constrained.

Enabling multi-segment mbufs

Multi-segment and single-segment mbufs are mutually exclusive, and the
user must decide on which approach to adopt on init. The introduction
of a new OVSDB field, 'dpdk-multi-seg-mbufs', facilitates this.

This is a global boolean value, which determines how jumbo frames are
represented across all DPDK ports. In the absence of a user-supplied
value, 'dpdk-multi-seg-mbufs' defaults to false, i.e. multi-segment
mbufs must be explicitly enabled / single-segment mbufs remain the
default.

Setting the field is identical to setting existing DPDK-specific OVSDB
fields:

ovs-vsctl set Open_vSwitch . other_config:dpdk-init=true
ovs-vsctl set Open_vSwitch . other_config:dpdk-lcore-mask=0x10
ovs-vsctl set Open_vSwitch . other_config:dpdk-socket-mem=4096,0
==> ovs-vsctl set Open_vSwitch . other_config:dpdk-multi-seg-mbufs=true


Notes
=
This patchset was generated and built against:
- Current HEAD of OvS master [1] + DPDK v17.11 support patchset [2]
- DPDK v17.11

[1] 3728b3b ("Merge branch 'dpdk_merge' of https://github.com/ist...;)
[2] https://patchwork.ozlabs.org/series/13829/mbox/

Mark Kavanagh (4):
  netdev-dpdk: simplify mbuf sizing
  lib/dp-packet: init specific mbuf fields to 0
  lib/dp-packet: fix dp_packet_put_uninit for multi-seg mbufs
  netdev-dpdk: support multi-segment jumbo frames

Michael Qiu (4):
  lib/dp-packet: copy mbuf info for packet copy
  lib/dp-packet: Fix data_len issue with multi-segs
  lib/dp-packet: copy data from multi-seg. DPDK mbuf
  netdev-dpdk: copy large packet to multi-seg. mbufs

 NEWS |   1 +
 lib/dp-packet.c  |  43 +-
 lib/dp-packet.h  |  24 +-
 lib/dpdk.c   |   7 +++
 lib/netdev-dpdk.c| 122 ++-
 lib/netdev-dpdk.h|   1 +
 vswitchd/vswitch.xml |  20 +
 7 files changed, 183 insertions(+), 35 deletions(-)

-- 
1.9.3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] datapath-windows: Account for VLAN tag in tunnel Decap

2017-11-21 Thread Anand Kumar
Acked-by: Anand Kumar 

Thanks,
Anand Kumar

On 11/20/17, 3:06 PM, "ovs-dev-boun...@openvswitch.org on behalf of Shashank 
Ram"  wrote:

Decap functions for tunneling protocols do not compute
the packet header offsets correctly when there is a VLAN
tag in the L2 header. This results in incorrect checksum
computation causing the packet to be dropped.

This patch adds support to account for the VLAN tag in the
packet if its present, and makes use of the OvsExtractLayers()
function to correctly compute the header offsets for different
layers.

Testing done:
- Tested Geneve, STT, Vxlan and Gre and verified that there
  are no regressions.
- Verified that packets with VLAN tags are correctly handled
  in the decap code of all tunneling protocols. Previously,
  this would result in packet drops due to invalid checksums
  being computed.
- Verified that non-VLAN tagged packets are handled correctly.

Signed-off-by: Shashank Ram 
---
 datapath-windows/ovsext/Geneve.c  | 14 +
 datapath-windows/ovsext/Geneve.h  |  6 ++
 datapath-windows/ovsext/Gre.c | 29 --
 datapath-windows/ovsext/Gre.h | 16 ++
 datapath-windows/ovsext/Offload.c | 10 +
 datapath-windows/ovsext/Offload.h |  3 ++-
 datapath-windows/ovsext/Stt.c | 44 
+++
 datapath-windows/ovsext/Stt.h |  6 ++
 datapath-windows/ovsext/Vxlan.c   | 14 +
 datapath-windows/ovsext/Vxlan.h   |  6 ++
 10 files changed, 111 insertions(+), 37 deletions(-)

diff --git a/datapath-windows/ovsext/Geneve.c 
b/datapath-windows/ovsext/Geneve.c
index 6dca69b..210716d 100644
--- a/datapath-windows/ovsext/Geneve.c
+++ b/datapath-windows/ovsext/Geneve.c
@@ -262,10 +262,16 @@ NDIS_STATUS OvsDecapGeneve(POVS_SWITCH_CONTEXT 
switchContext,
 PUINT8 bufferStart;
 PVOID optStart;
 NDIS_STATUS status;
+OVS_PACKET_HDR_INFO layers = { 0 };
+
+status = OvsExtractLayers(curNbl, &layers);
+if (status != NDIS_STATUS_SUCCESS) {
+return status;
+}

 /* Check the length of the UDP payload */
 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
-tunnelSize = OvsGetGeneveTunHdrMinSize();
+tunnelSize = OvsGetGeneveTunHdrSizeFromLayers(&layers);
 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
 if (packetLength <= tunnelSize) {
 return NDIS_STATUS_INVALID_LENGTH;
@@ -295,13 +301,13 @@ NDIS_STATUS OvsDecapGeneve(POVS_SWITCH_CONTEXT 
switchContext,

 ethHdr = (EthHdr *)bufferStart;
 /* XXX: Handle IP options. */
-ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
+ipHdr = (IPHdr *)(bufferStart + layers.l3Offset);
 tunKey->src = ipHdr->saddr;
 tunKey->dst = ipHdr->daddr;
 tunKey->tos = ipHdr->tos;
 tunKey->ttl = ipHdr->ttl;
 tunKey->pad = 0;
-udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
+udpHdr = (UDPHdr *)(bufferStart + layers.l4Offset);

 /* Validate if NIC has indicated checksum failure. */
 status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
@@ -312,7 +318,7 @@ NDIS_STATUS OvsDecapGeneve(POVS_SWITCH_CONTEXT 
switchContext,
 /* Calculate and verify UDP checksum if NIC didn't do it. */
 if (udpHdr->check != 0) {
 status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr,
- packetLength);
+ packetLength, &layers);
 tunKey->flags |= OVS_TNL_F_CSUM;
 if (status != NDIS_STATUS_SUCCESS) {
 goto dropNbl;
diff --git a/datapath-windows/ovsext/Geneve.h 
b/datapath-windows/ovsext/Geneve.h
index 019c0dd..db758dd 100644
--- a/datapath-windows/ovsext/Geneve.h
+++ b/datapath-windows/ovsext/Geneve.h
@@ -113,6 +113,12 @@ OvsGetGeneveTunHdrMaxSize(VOID)
 return OvsGetGeneveTunHdrMinSize() + TUN_OPT_MAX_LEN;
 }

+static __inline UINT32
+OvsGetGeneveTunHdrSizeFromLayers(POVS_PACKET_HDR_INFO layers)
+{
+return layers->l7Offset + sizeof(GeneveHdr);
+}
+
 #define GENEVE_UDP_PORT 6081
 #define GENEVE_UDP_PORT_NBO 0xC117
 #define GENEVE_VER 0
diff --git a/datapath-windows/ovsext/Gre.c b/datapath-windows/ovsext/Gre.c
index f095742..1f38ee7 100644
--- a/datapath-windows/ovsext/Gre.c
+++ b/datapath-windows/ovsext/Gre.c
@@ -317,35 +317,42 @@ OvsDecapGre(POVS_SWITCH_CONTEXT switchContext,
 GREHdr *greHdr;
 UINT32 tunnelSize, packetLength;
 UINT32 headRoom = 0;
+UINT32 maxGreLen;
 PUINT8 bufferStart;
 NDIS_STATUS status = 

Re: [ovs-dev] [PATCH] dpif-netdev: Refactor datapath flow cache

2017-11-21 Thread Jan Scheurich
Hi Billy,

Thanks for your feedback.

I continue to have problems with sending out patches via mail from our company 
network. This time it seems it is the form feed (^L) character in the 
dpif-netdev.c file that got corrupted in the emailed version of the patch. I 
wish there were a way to upload patches other than through email.

Please try the attached patch file instead. This one applies cleanly on the tip 
of master for me.

Some more answers below.

BR, Jan

> > The EMC implementation is simplified by removing the possibility to store a 
> > flow
> > in two slots, as there is no particular reason why two flows should 
> > systematically
> > collide (the RSS hash is not symmetric).
> > The maximum size of the EMC flow key is limited to 256 bytes to reduce the
> > memory footprint. This should be sufficient to hold most real life packet 
> > flow
> > keys. Larger flows are not installed in the EMC.
> [[BO'M]] Does miniflow_extract work ok with the reduced miniflow size?
> Miniflow comment says:
>   Caller is responsible for initializing 'dst' with enough storage for 
> FLOW_U64S * 8 bytes.
> 

[Jan] miniflow_extract() is not used here. At EMC insertion I just copy the 
extracted miniflow into the EMC entry, provided that its length does not exceed 
the 248 available octets. If it is too large, I simply don't insert the flow.
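For the record, a minimal sketch of that insertion guard (illustrative only;
the struct layout and names below are simplified stand-ins, not the actual
patch code):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define EMC_KEY_MAX_LEN 248   /* 256-byte key minus the hash/len header. */

struct emc_key {
    uint32_t hash;
    uint32_t len;                    /* Length of 'buf' in bytes. */
    uint8_t buf[EMC_KEY_MAX_LEN];    /* Copied miniflow data. */
};

/* Copy the already-extracted miniflow into the EMC entry if it fits;
 * oversized flows are simply not cached in the EMC. */
static bool
emc_try_insert(struct emc_key *entry, uint32_t hash,
               const void *miniflow, uint32_t mf_len)
{
    if (mf_len > EMC_KEY_MAX_LEN) {
        return false;    /* Too large: rely on DFC/DPCLS instead. */
    }
    entry->hash = hash;
    entry->len = mf_len;
    memcpy(entry->buf, miniflow, mf_len);
    return true;
}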

> > The pmd-stats-show command is enhanced to show both EMC and DFC hits
> > separately.
> >
> > The sweep speed for cleaning up obsolete EMC and DFC flow entries and 
> > freeing
> > dead megaflow entries is increased. With a typical PMD cycle duration of 
> > 100us
> > under load and checking one DFC entry per cycle, the DFC sweep should
> > normally complete within in 100s.
> >
> > In PVP performance tests with an L3 pipeline over VXLAN we determined the
> > optimal EMC size to be 16K entries to obtain a uniform speedup compared to
> > the master branch over the full range of parallel flows. The measurement 
> > below
> > is for 64 byte packets and the average number of subtable lookups per DPCLS 
> > hit
> > in this pipeline is 1.0, i.e. the acceleration already starts for a single 
> > busy mask.
> > Tests with many visited subtables should show a strong increase of the gain
> > through DFC.
> >
> > Flows   master  DFC+EMC  Gain
> > [Mpps]  [Mpps]
> > --
> > 8   4.454.62 3.8%
> > 100 4.174.47 7.2%
> > 10003.884.3412.0%
> > 20003.544.1717.8%
> > 50003.013.8227.0%
> > 1   2.753.6331.9%
> > 2   2.643.5032.8%
> > 5   2.603.3328.1%
> > 10  2.593.2324.7%
> > 50  2.593.1621.9%
> [[BO'M]]
> What is the flow distribution here?
> 
> Are there other flow distributions that we want to ensure do not suffer a 
> possible regression. I'm not sure what they are exactly - I have
> some ideas but admittedly I've only every tested with either a uniform flow 
> distribution or else a round-robin distribution.

[Jan] Since we are using DPDK Pktgen as the traffic generator in our standard 
setup, this is a round-robin flow distribution, which, from all I understand, is 
the worst case for flow caching, as the flow packets are equidistant and there 
is no clustering.

I would be happy if people with other traffic generators can provide 
complementary measurements. 

> > +
> > +/* EMC lookup not successful: try DFC lookup. */
> > +struct dfc_entry *current_entry = dfc_entry_get(cache, key->hash);
> > +flow = current_entry->flow;
> 
>  [[BO'M]] There is no equivalent of netdev_flow_key_equal_mf() used in 
> emc_lookup here - which does a memcmp to verify the minflows
> are exactly equal. Without something similar for the DFC Isn't there a 
> possibility that the 20bit hashes of two packets will accidently point
> to the same flow struct?
> 

Yes, there is a risk of collision. The check that the DFC entry points to 
the correct megaflow entry is done below. 

> >
> > -if (min && random_uint32() <= min) {
> > -emc_insert(>flow_cache, key, flow);
> > +if (dfc_entry_alive(current_entry) &&
> > +dpcls_rule_matches_key(>cr, key)) {

[Jan] If the pointed to megaflow is alive, we use the existing function 
dpcls_rule_matches_key() to do a masked comparison of the packet's flow key 
with the megaflow's rule. This function is used by the DPCLS classifier to 
check the packet against rules in the subtable matching the subtable hash and 
we can reuse it as is.

In case the match fails, we have a DFC miss. The subsequent DPCLS lookup will 
then overwrite the current DFC entry. I decided not to implement probabilistic 
DFC insertion for two reasons: a) the DFC is much larger, so thrashing only sets 
in at much higher flow counts, and b) updating the megaflow pointer in the DFC 
is far cheaper than EMC insertion.
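To summarize the lookup path discussed above in code form, here is a
self-contained toy model (names and sizes are illustrative; this is not the
patch itself): 20 bits of the packet hash index an array of pointers to the
last-hit megaflow, a masked comparison confirms the hit, and on a miss the
classifier result overwrites the entry unconditionally.

#include <stdbool.h>
#include <stdint.h>

#define DFC_ENTRIES (1u << 20)       /* 1M entries, indexed by 20 hash bits. */
#define DFC_MASK    (DFC_ENTRIES - 1)

struct megaflow {
    uint64_t match;                  /* (key & mask) must equal this. */
    uint64_t mask;
    bool alive;
};

struct dfc_cache {
    struct megaflow *entries[DFC_ENTRIES];
};

static bool
megaflow_matches(const struct megaflow *mf, uint64_t key)
{
    return mf && mf->alive && (key & mf->mask) == mf->match;
}

/* Return the cached megaflow on a DFC hit; otherwise fall back to the
 * caller-supplied classifier lookup and cache its result. */
static struct megaflow *
dfc_lookup(struct dfc_cache *cache, uint32_t hash, uint64_t key,
           struct megaflow *(*dpcls_lookup)(uint64_t key))
{
    struct megaflow **entry = &cache->entries[hash & DFC_MASK];

    if (megaflow_matches(*entry, key)) {
        return *entry;               /* DFC hit: no DPCLS lookup needed. */
    }
    *entry = dpcls_lookup(key);      /* Miss: overwrite the stale entry. */
    return *entry;
}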

___
dev mailing list
d...@openvswitch.org

Re: [ovs-dev] [PATCH 1/2] ovn-ctl: Add -vfile:info option to OVN_NB/SB_LOG options

2017-11-21 Thread Numan Siddique
Thank you!

On Nov 20, 2017 10:07 PM, "Ben Pfaff"  wrote:

> Yes, of course.  I didn't get much work done last week with ovscon
> going.
>
> On Mon, Nov 20, 2017 at 09:11:41PM +0530, Numan Siddique wrote:
> > A gentle ping. Ben, is it possible to have a look at this patch set -
> > https://patchwork.ozlabs.org/project/openvswitch/list/?series=12481.
> >
> > Thanks
> > Numan
> >
> >
> > On Wed, Nov 8, 2017 at 2:30 PM, Numan Siddique 
> wrote:
> >
> > >
> > >
> > > On Wed, Nov 8, 2017 at 9:36 AM, Numan Siddique 
> > > wrote:
> > >
> > >>
> > >>
> > >> On Wed, Nov 8, 2017 at 9:25 AM, Ben Pfaff  wrote:
> > >>
> > >>> On Wed, Nov 08, 2017 at 08:57:38AM +0530, Numan Siddique wrote:
> > >>> > On Wed, Nov 8, 2017 at 8:36 AM, Ben Pfaff  wrote:
> > >>> >
> > >>> > > On Tue, Nov 07, 2017 at 09:01:06PM +0530, nusid...@redhat.com
> wrote:
> > >>> > > > From: Numan Siddique 
> > >>> > > >
> > >>> > > > In the RHEL environment, when OVN db servers are started using
> > >>> ovn-ctl,
> > >>> > > > log files are empty. Adding "-vfile:info" option to
> ovsdb-server is
> > >>> > > > resolving this issue. Running 'ovs-apptctl -t .. vlog/reopen"
> > >>> results in
> > >>> > > the
> > >>> > > > logs appearing in the log files. This issue is seen with 2.7.2.
> > >>> > > >
> > >>> > > > "-vfile:info" option is passed to ovn-northd and ovn-controller
> > >>> when
> > >>> > > starting.
> > >>> > > > There is no harm in adding this to OVN db servers.
> > >>> > > >
> > >>> > > > Signed-off-by: Numan Siddique 
> > >>> > >
> > >>> > > This should be harmless, since "info" is the default log level,
> but I
> > >>> > > don't understand why it makes a difference.  Is there an
> underlying
> > >>> bug
> > >>> > > that should be fixed?  (Is the --log-file option being passed in?
> > >>> That
> > >>> > > is the documented way to enable logging to a file.)
> > >>> > >
> > >>> >
> > >>> > Yes. --log-file option is passed. I only see this issue with RHEL
> and
> > >>> not
> > >>> > with centos.
> > >>> > And if I restart the DB servers later, the log file gets generated
> > >>> > properly. It is seen
> > >>> > only for the first time durng tripleo deployment in RHEL
> environment. I
> > >>> > couldn't figure out the underlying
> > >>> > cause. Since "vlog:info" is passed for ovn-northd and
> ovn-controller, I
> > >>> > thought of taking this approach.
> > >>> > But I agree there seems to be underlying issue and this patch might
> > >>> hide
> > >>> > it.
> > >>>
> > >>> Have you spent a little time trying to track down the root cause?  If
> > >>> you have already done that, and still did not figure it out, then I'm
> > >>> happy to apply this patch.  Let me know.
> > >>>
> > >>
> > >> Thanks Ben. I did try. I am trying again with a fresh deployment to
> see
> > >> if I can find anything.
> > >> I will come back to you with the results.
> > >>
> > >>
> > > Hi Ben,
> > >
> > > I couldn't find the root cause.  I submitted v2 of the patch series.
> > >
> > > Thanks
> > > Numan
> > >
> > >
> > >> Thanks
> > >> Numan
> > >>
> > >>
> > >>
> > >
>
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] OVS DPDK: dpdk_merge pull request

2017-11-21 Thread Stokes, Ian
> On Fri, Nov 17, 2017 at 07:27:51PM +, Stokes, Ian wrote:
> > The following changes since commit
> 1ae83bb20677b42d63dbb2140fa8ed3144c6260f:
> >
> >   netdev-tc-offloads: Add support for action set (2017-11-16 08:10:29
> > -0800)
> >
> > are available in the git repository at:
> >
> >   https://github.com/istokes/ovs dpdk_merge
> >
> > for you to fetch changes up to af5b0dad30359d14d6412eb80e81783a83a678ce:
> >
> >   netdev-dpdk: Fix mempool creation with large MTU. (2017-11-17
> > 16:26:33 +)
> 
> Thanks a lot.  I merged this branch into master.

Thanks Ben.

> 
> This yields a new "sparse" warning:
> ../lib/netdev-tc-offloads.c:584:20: warning: Variable length array is
> used.
> because of this declaration in parse_put_flow_set_masked_action():
> char *set_buff[set_len], *set_data, *set_mask;
> 

So this was introduced in commit 1ae83bb20677b42d63dbb2140fa8ed3144c6260f: 
netdev-tc-offloads: Add support for action set. This was pushed just after I 
had submitted the pull request (but wasn't actually part of the pull request 
patches).

It's strange though: I ran sparse before submitting the pull request and it 
came back clean. Checking after the merge, I don't see the warning you see. 
(Normally I would not submit a pull request if a sparse warning occurred.) What 
version of sparse are you using, out of interest? I'm using sparse-0.5.0-10.
 
> It may or may not be worth fixing that.  In favor of fixing it is keeping
> OVS sparse warning free, but against it is that the replacement would
> probably be malloc(), which is slower.
> 
> However, looking a bit closer (now that I've already done the merge), I
> think there is a bug here.  set_buff[] is declared as char
> *set_buff[set_len], which has size (set_len * sizeof(char *)), but only
> set_len bytes of it are used.  I think that the right declaration would be
> more like uint8_t set_buff[set_len];, although that would lack the proper
> alignment for struct nl_attr.
> 
> Maybe something like this would be the appropriate fix for both issues.
> Will you please review it?

Sure, will be happy to review, but would also be good to get someone with 
tc-offload experience to sign off also.

Roi has also submitted a patch to fix the issue, maybe Roi could review below 
also?

https://mail.openvswitch.org/pipermail/ovs-dev/2017-November/34.html

> 
> Thanks,
> 
> Ben.
> 
> diff --git a/lib/netdev-tc-offloads.c b/lib/netdev-tc-offloads.c index
> 781d849e4a87..d6b61f6360d0 100644
> --- a/lib/netdev-tc-offloads.c
> +++ b/lib/netdev-tc-offloads.c
> @@ -581,7 +581,9 @@ parse_put_flow_set_masked_action(struct tc_flower
> *flower,
>   bool hasmask)  {
>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> -char *set_buff[set_len], *set_data, *set_mask;
> +uint64_t set_stub[1024 / 8];
> +struct ofpbuf set_buf = OFPBUF_STUB_INITIALIZER(set_stub);
> +char *set_data, *set_mask;
>  char *key = (char *) >rewrite.key;
>  char *mask = (char *) >rewrite.mask;
>  const struct nlattr *attr;
> @@ -589,8 +591,7 @@ parse_put_flow_set_masked_action(struct tc_flower
> *flower,
>  size_t size;
> 
>  /* copy so we can set attr mask to 0 for used ovs key struct members
> */
> -memcpy(set_buff, set, set_len);
> -attr = (struct nlattr *) set_buff;
> +attr = ofpbuf_put(_buf, set, set_len);
> 
>  type = nl_attr_type(attr);
>  size = nl_attr_get_size(attr) / 2;
> @@ -600,6 +601,7 @@ parse_put_flow_set_masked_action(struct tc_flower
> *flower,
>  if (type >= ARRAY_SIZE(set_flower_map)
>  || !set_flower_map[type][0].size) {
>  VLOG_DBG_RL(, "unsupported set action type: %d", type);
> +ofpbuf_uninit(_buf);
>  return EOPNOTSUPP;
>  }
> 
> @@ -632,9 +634,11 @@ parse_put_flow_set_masked_action(struct tc_flower
> *flower,
>  if (hasmask && !is_all_zeros(set_mask, size)) {
>  VLOG_DBG_RL(, "unsupported sub attribute of set action type
> %d",
>  type);
> +ofpbuf_uninit(_buf);
>  return EOPNOTSUPP;
>  }
> 
> +ofpbuf_uninit(_buf);
>  return 0;
>  }
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 5/7] netdev-tc-offloads: Remove redundant loop handling ovs action set

2017-11-21 Thread Roi Dayan
From: Paul Blakey 

OVS action set always has a single nested OVS_KEY_ATTR_*
so there is no need to iterate it.

Signed-off-by: Paul Blakey 
Reviewed-by: Roi Dayan 
---
 lib/netdev-tc-offloads.c | 90 +++-
 1 file changed, 44 insertions(+), 46 deletions(-)

diff --git a/lib/netdev-tc-offloads.c b/lib/netdev-tc-offloads.c
index 6ad2551..1f77b55 100644
--- a/lib/netdev-tc-offloads.c
+++ b/lib/netdev-tc-offloads.c
@@ -644,56 +644,54 @@ static int
 parse_put_flow_set_action(struct tc_flower *flower, const struct nlattr *set,
   size_t set_len)
 {
-const struct nlattr *set_attr;
-size_t set_left;
-
-NL_ATTR_FOR_EACH_UNSAFE (set_attr, set_left, set, set_len) {
-if (nl_attr_type(set_attr) == OVS_KEY_ATTR_TUNNEL) {
-const struct nlattr *tunnel = nl_attr_get(set_attr);
-const size_t tunnel_len = nl_attr_get_size(set_attr);
-const struct nlattr *tun_attr;
-size_t tun_left;
-
-flower->set.set = true;
-NL_ATTR_FOR_EACH_UNSAFE (tun_attr, tun_left, tunnel, tunnel_len) {
-switch (nl_attr_type(tun_attr)) {
-case OVS_TUNNEL_KEY_ATTR_ID: {
-flower->set.id = nl_attr_get_be64(tun_attr);
-}
-break;
-case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: {
-flower->set.ipv4.ipv4_src = nl_attr_get_be32(tun_attr);
-}
-break;
-case OVS_TUNNEL_KEY_ATTR_IPV4_DST: {
-flower->set.ipv4.ipv4_dst = nl_attr_get_be32(tun_attr);
-}
-break;
-case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: {
-flower->set.ipv6.ipv6_src =
-nl_attr_get_in6_addr(tun_attr);
-}
-break;
-case OVS_TUNNEL_KEY_ATTR_IPV6_DST: {
-flower->set.ipv6.ipv6_dst =
-nl_attr_get_in6_addr(tun_attr);
-}
-break;
-case OVS_TUNNEL_KEY_ATTR_TP_SRC: {
-flower->set.tp_src = nl_attr_get_be16(tun_attr);
-}
-break;
-case OVS_TUNNEL_KEY_ATTR_TP_DST: {
-flower->set.tp_dst = nl_attr_get_be16(tun_attr);
-}
-break;
-}
-}
-} else {
+const struct nlattr *tunnel;
+const struct nlattr *tun_attr;
+size_t tun_left, tunnel_len;
+
+if (nl_attr_type(set) != OVS_KEY_ATTR_TUNNEL) {
 return parse_put_flow_set_masked_action(flower, set, set_len,
 false);
+}
+
+tunnel = nl_attr_get(set);
+tunnel_len = nl_attr_get_size(set);
+
+flower->set.set = true;
+NL_ATTR_FOR_EACH_UNSAFE(tun_attr, tun_left, tunnel, tunnel_len) {
+switch (nl_attr_type(tun_attr)) {
+case OVS_TUNNEL_KEY_ATTR_ID: {
+flower->set.id = nl_attr_get_be64(tun_attr);
+}
+break;
+case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: {
+flower->set.ipv4.ipv4_src = nl_attr_get_be32(tun_attr);
+}
+break;
+case OVS_TUNNEL_KEY_ATTR_IPV4_DST: {
+flower->set.ipv4.ipv4_dst = nl_attr_get_be32(tun_attr);
+}
+break;
+case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: {
+flower->set.ipv6.ipv6_src =
+nl_attr_get_in6_addr(tun_attr);
+}
+break;
+case OVS_TUNNEL_KEY_ATTR_IPV6_DST: {
+flower->set.ipv6.ipv6_dst =
+nl_attr_get_in6_addr(tun_attr);
+}
+break;
+case OVS_TUNNEL_KEY_ATTR_TP_SRC: {
+flower->set.tp_src = nl_attr_get_be16(tun_attr);
+}
+break;
+case OVS_TUNNEL_KEY_ATTR_TP_DST: {
+flower->set.tp_dst = nl_attr_get_be16(tun_attr);
+}
+break;
 }
 }
+
 return 0;
 }
 
-- 
2.7.5

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 6/7] tc: Fix wrong struct variable order

2017-11-21 Thread Roi Dayan
From: Paul Blakey 

Fix the struct variable order to correspond with
its usage.

Signed-off-by: Paul Blakey 
Reviewed-by: Roi Dayan 
---
 lib/tc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/tc.c b/lib/tc.c
index 4f82623..9170e3e 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -62,8 +62,8 @@ struct tc_pedit_key_ex {
 
 struct flower_key_to_pedit {
 enum pedit_header_type htype;
-int flower_offset;
 int offset;
+int flower_offset;
 int size;
 };
 
-- 
2.7.5

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 2/7] netdev-tc-offloads: Fix accidental skipping of extended pedit keys

2017-11-21 Thread Roi Dayan
From: Paul Blakey 

We only support extended pedit keys for now, so it's the type we
expect. Skip the legacy ones instead.

Signed-off-by: Paul Blakey 
Reviewed-by: Roi Dayan 
---
 lib/tc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/tc.c b/lib/tc.c
index 627de8a..401 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -484,7 +484,7 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower 
*flower)
 break;
 }
 
-if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
+if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
 VLOG_ERR_RL(_rl, "unable to parse legacy pedit type: %d",
 nl_attr_type(nla));
 return EOPNOTSUPP;
-- 
2.7.5

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 7/7] tc: Send csum action only if we need to update csum

2017-11-21 Thread Roi Dayan
From: Paul Blakey 

Currently we send the tc csum action even if it's not needed.
Fix that by sending it only if the csum update flags are nonzero.

Signed-off-by: Paul Blakey 
Reviewed-by: Roi Dayan 
---
 lib/tc.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/tc.c b/lib/tc.c
index 9170e3e..d3a0312 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -1371,9 +1371,11 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct 
tc_flower *flower)
 }
 nl_msg_end_nested(request, act_offset);
 
-act_offset = nl_msg_start_nested(request, act_index++);
-nl_msg_put_act_csum(request, flower->csum_update_flags);
-nl_msg_end_nested(request, act_offset);
+if (flower->csum_update_flags) {
+act_offset = nl_msg_start_nested(request, act_index++);
+nl_msg_put_act_csum(request, flower->csum_update_flags);
+nl_msg_end_nested(request, act_offset);
+}
 }
 if (flower->set.set) {
 act_offset = nl_msg_start_nested(request, act_index++);
-- 
2.7.5

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 4/7] netdev-tc-offloads: Remove redundant brackets

2017-11-21 Thread Roi Dayan
From: Paul Blakey 

Signed-off-by: Paul Blakey 
Reviewed-by: Roi Dayan 
---
 lib/tc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/tc.c b/lib/tc.c
index 360a770..4f82623 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -1059,7 +1059,7 @@ static void
 nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm,
  struct tc_pedit_key_ex *ex)
 {
-size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key));
+size_t ksize = sizeof *parm + parm->nkeys * sizeof(struct tc_pedit_key);
 size_t offset, offset_keys_ex, offset_key;
 int i;
 
-- 
2.7.5

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 3/7] netdev-tc-offloads: Verify csum flags on dump from tc

2017-11-21 Thread Roi Dayan
From: Paul Blakey 

On dump, parse and verify the tc csum action update flags
in the same way as we put them.

Signed-off-by: Paul Blakey 
Reviewed-by: Roi Dayan 
---
 lib/tc.c | 117 ++-
 lib/tc.h |   2 ++
 2 files changed, 103 insertions(+), 16 deletions(-)

diff --git a/lib/tc.c b/lib/tc.c
index 401..360a770 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -131,6 +131,10 @@ static struct flower_key_to_pedit flower_pedit_map[] = {
 },
 };
 
+static inline int
+csum_update_flag(struct tc_flower *flower,
+ enum pedit_header_type htype);
+
 struct tcmsg *
 tc_make_request(int ifindex, int type, unsigned int flags,
 struct ofpbuf *request)
@@ -465,7 +469,7 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower 
*flower)
 char *rewrite_key = (void *) >rewrite.key;
 char *rewrite_mask = (void *) >rewrite.mask;
 size_t keys_ex_size, left;
-int type, i = 0;
+int type, i = 0, err;
 
 if (!nl_parse_nested(options, pedit_policy, pe_attrs,
  ARRAY_SIZE(pedit_policy))) {
@@ -493,6 +497,11 @@ nl_parse_act_pedit(struct nlattr *options, struct 
tc_flower *flower)
 ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
 type = nl_attr_get_u16(ex_type);
 
+err = csum_update_flag(flower, type);
+if (err) {
+return err;
+}
+
 for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
 struct flower_key_to_pedit *m = _pedit_map[j];
 int flower_off = m->flower_offset;
@@ -736,6 +745,46 @@ nl_parse_act_vlan(struct nlattr *options, struct tc_flower 
*flower)
 return 0;
 }
 
+static const struct nl_policy csum_policy[] = {
+[TCA_CSUM_PARMS] = { .type = NL_A_UNSPEC,
+ .min_len = sizeof(struct tc_csum),
+ .optional = false, },
+};
+
+static int
+nl_parse_act_csum(struct nlattr *options, struct tc_flower *flower)
+{
+struct nlattr *csum_attrs[ARRAY_SIZE(csum_policy)];
+const struct tc_csum *c;
+const struct nlattr *csum_parms;
+
+if (!nl_parse_nested(options, csum_policy, csum_attrs,
+ ARRAY_SIZE(csum_policy))) {
+VLOG_ERR_RL(_rl, "failed to parse csum action options");
+return EPROTO;
+}
+
+csum_parms = csum_attrs[TCA_CSUM_PARMS];
+c = nl_attr_get_unspec(csum_parms, sizeof *c);
+
+/* sanity checks */
+if (c->update_flags != flower->csum_update_flags) {
+VLOG_WARN_RL(_rl,
+ "expected different act csum flags: 0x%x != 0x%x",
+ flower->csum_update_flags, c->update_flags);
+return EINVAL;
+}
+flower->csum_update_flags = 0; /* so we know csum was handled */
+
+if (flower->needs_full_ip_proto_mask
+&& flower->mask.ip_proto != UINT8_MAX) {
+VLOG_WARN_RL(_rl, "expected full matching on flower ip_proto");
+return EINVAL;
+}
+
+return 0;
+}
+
 static const struct nl_policy act_policy[] = {
 [TCA_ACT_KIND] = { .type = NL_A_STRING, .optional = false, },
 [TCA_ACT_COOKIE] = { .type = NL_A_UNSPEC, .optional = true, },
@@ -782,8 +831,7 @@ nl_parse_single_action(struct nlattr *action, struct 
tc_flower *flower)
 } else if (!strcmp(act_kind, "pedit")) {
 nl_parse_act_pedit(act_options, flower);
 } else if (!strcmp(act_kind, "csum")) {
-/* not doing anything for now, ovs has an implicit csum recalculation
- * with rewriting of packet headers (translating of pedit acts). */
+nl_parse_act_csum(act_options, flower);
 } else {
 VLOG_ERR_RL(_rl, "unknown tc action kind: %s", act_kind);
 return EINVAL;
@@ -840,6 +888,13 @@ nl_parse_flower_actions(struct nlattr **attrs, struct 
tc_flower *flower)
 }
 }
 
+if (flower->csum_update_flags) {
+VLOG_WARN_RL(_rl,
+ "expected act csum with flags: 0x%x",
+ flower->csum_update_flags);
+return EINVAL;
+}
+
 return 0;
 }
 
@@ -1181,28 +1236,49 @@ calc_offsets(struct tc_flower *flower, struct 
flower_key_to_pedit *m,
 *mask = (void *) (rewrite_mask + m->flower_offset - diff);
 }
 
-static inline void
+static inline int
 csum_update_flag(struct tc_flower *flower,
  enum pedit_header_type htype) {
-if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) {
+/* Explicitly specify the csum flags so HW can return EOPNOTSUPP
+ * if it doesn't support a checksum recalculation of some headers.
+ * And since OVS allows a flow such as
+ * eth(dst=),eth_type(0x0800) actions=set(ipv4(src=))
+ * we need to force a more specific flow as this can, for example,
+ * need a recalculation of the icmp checksum if the packet that passes
+ * is icmp, and of the tcp checksum if it's tcp. */
+
+switch (htype) {
+case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:

[ovs-dev] [PATCH 1/7] netdev-tc-offloads: Fix travis compilation error

2017-11-21 Thread Roi Dayan
From: Paul Blakey 

Travis complains about variable-length array usage; use
a fixed size instead.

Signed-off-by: Paul Blakey 
Reviewed-by: Roi Dayan 
---
 lib/netdev-tc-offloads.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/netdev-tc-offloads.c b/lib/netdev-tc-offloads.c
index 781d849..6ad2551 100644
--- a/lib/netdev-tc-offloads.c
+++ b/lib/netdev-tc-offloads.c
@@ -581,13 +581,15 @@ parse_put_flow_set_masked_action(struct tc_flower *flower,
  bool hasmask)
 {
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
-char *set_buff[set_len], *set_data, *set_mask;
+char *set_buff[128], *set_data, *set_mask;
 char *key = (char *) >rewrite.key;
 char *mask = (char *) >rewrite.mask;
 const struct nlattr *attr;
 int i, j, type;
 size_t size;
 
+ovs_assert(set_len <= 128);
+
 /* copy so we can set attr mask to 0 for used ovs key struct members  */
 memcpy(set_buff, set, set_len);
 attr = (struct nlattr *) set_buff;
-- 
2.7.5

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 0/7] Fixes for header rewrite feature

2017-11-21 Thread Roi Dayan
Hi,

The following series fixes some issues with header rewrite
and includes some small refactoring.

Thanks,
Roi


Paul Blakey (7):
  netdev-tc-offloads: Fix travis compilation error
  netdev-tc-offloads: Fix accidental skipping of extended pedit keys
  netdev-tc-offloads: Verify csum flags on dump from tc
  netdev-tc-offloads: Remove redundant brackets
  netdev-tc-offloads: Remove redundant loop handling ovs action set
  tc: Fix wrong struct variable order
  tc: Send csum action only if we need to update csum

 lib/netdev-tc-offloads.c |  94 +-
 lib/tc.c | 131 +++
 lib/tc.h |   2 +
 3 files changed, 158 insertions(+), 69 deletions(-)

-- 
2.7.5

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] dpif-netdev: Refactor datapath flow cache

2017-11-21 Thread O Mahony, Billy
Hi Jan,

Thanks, that's a really interesting patch.

Currently it does not apply to the head of master - what rev can I apply it to?

Some more below - including one way down in the code.

Thanks,
/Billy.

> -Original Message-
> From: ovs-dev-boun...@openvswitch.org [mailto:ovs-dev-
> boun...@openvswitch.org] On Behalf Of Jan Scheurich
> Sent: Monday, November 20, 2017 5:33 PM
> To: d...@openvswitch.org
> Subject: [ovs-dev] [PATCH] dpif-netdev: Refactor datapath flow cache
> 
> So far the netdev datapath uses an 8K EMC to speed up the lookup of frequently
> used flows by comparing the parsed packet headers against the miniflow of a
> cached flow, using 13 bits of the packet RSS hash as index. The EMC is too 
> small
> for many applications with 100K or more parallel packet flows so that EMC
> threshing actually degrades performance.
> Furthermore, the size of struct miniflow and the flow copying cost prevents us
> from making it much larger.
> 
> At the same time the lookup cost of the megaflow classifier (DPCLS) is 
> increasing
> as the number of frequently hit subtables grows with the complexity of 
> pipeline
> and the number of recirculations.
> 
> To close the performance gap for many parallel flows, this patch introduces 
> the
> datapath flow cache (DFC) with 1M entries as lookup stage between EMC and
> DPCLS. It directly maps 20 bits of the RSS hash to a pointer to the last hit
> megaflow entry and performs a masked comparison of the packet flow with the
> megaflow key to confirm the hit. This avoids the costly DPCLS lookup even for
> very large number of parallel flows with a small memory overhead.
> 
> Due the large size of the DFC and the low risk of DFC thrashing, any DPCLS hit
> immediately inserts an entry in the DFC so that subsequent packets get speeded
> up. The DFC, thus, accelerate also short-lived flows.
> 
> To further accelerate the lookup of few elephant flows, every DFC hit 
> triggers a
> probabilistic EMC insertion of the flow. As the DFC entry is already in place 
> the
> default EMC insertion probability can be reduced to
> 1/1000 to minimize EMC thrashing should there still be many fat flows.
> The inverse EMC insertion probability remains configurable.
> 
> The EMC implementation is simplified by removing the possibility to store a 
> flow
> in two slots, as there is no particular reason why two flows should 
> systematically
> collide (the RSS hash is not symmetric).
> The maximum size of the EMC flow key is limited to 256 bytes to reduce the
> memory footprint. This should be sufficient to hold most real life packet flow
> keys. Larger flows are not installed in the EMC.
[[BO'M]] Does miniflow_extract work ok with the reduced miniflow size?
Miniflow comment says: 
  Caller is responsible for initializing 'dst' with enough storage for 
FLOW_U64S * 8 bytes.

> 
> The pmd-stats-show command is enhanced to show both EMC and DFC hits
> separately.
> 
> The sweep speed for cleaning up obsolete EMC and DFC flow entries and freeing
> dead megaflow entries is increased. With a typical PMD cycle duration of 100us
> under load and checking one DFC entry per cycle, the DFC sweep should
> normally complete within in 100s.
> 
> In PVP performance tests with an L3 pipeline over VXLAN we determined the
> optimal EMC size to be 16K entries to obtain a uniform speedup compared to
> the master branch over the full range of parallel flows. The measurement below
> is for 64 byte packets and the average number of subtable lookups per DPCLS 
> hit
> in this pipeline is 1.0, i.e. the acceleration already starts for a single 
> busy mask.
> Tests with many visited subtables should show a strong increase of the gain
> through DFC.
> 
> Flows   master  DFC+EMC  Gain
> [Mpps]  [Mpps]
> --
> 8   4.454.62 3.8%
> 100 4.174.47 7.2%
> 10003.884.3412.0%
> 20003.544.1717.8%
> 50003.013.8227.0%
> 1   2.753.6331.9%
> 2   2.643.5032.8%
> 5   2.603.3328.1%
> 10  2.593.2324.7%
> 50  2.593.1621.9%
[[BO'M]]
What is the flow distribution here? 

Are there other flow distributions that we want to ensure do not suffer a 
possible regression? I'm not sure what they are exactly - I have some ideas but 
admittedly I've only ever tested with either a uniform flow distribution or 
else a round-robin distribution. 

> 
> 
> Signed-off-by: Jan Scheurich 
> ---
>  lib/dpif-netdev.c | 349 ---
> ---
>  1 file changed, 235 insertions(+), 114 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index db78318..efcf2e9 
> 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -127,19 +127,19 @@ struct netdev_flow_key {
>  uint64_t buf[FLOW_MAX_PACKET_U64S];  };
> 
> -/* Exact match cache for frequently used flows
> +/* Datapath flow cache (DFC) for frequently used flows
>  

Re: [ovs-dev] [PATCH v2] odp-execute: Skip processing actions when batch is emptied

2017-11-21 Thread Vishal Deep Ajmera
Does anyone see any issue with this change? Though it does not address any 
existing bug, it will make the code less error-prone. 

Regards,
Vishal

-Original Message-
From: ovs-dev-boun...@openvswitch.org [mailto:ovs-dev-boun...@openvswitch.org] 
On Behalf Of Vishal Deep Ajmera
Sent: Thursday, November 16, 2017 11:32 AM
To: d...@openvswitch.org
Subject: [ovs-dev] [PATCH v2] odp-execute: Skip processing actions when batch 
is emptied

Today in OVS, when errors are encountered during the execution of an action, the 
entire batch of packets may be deleted (e.g. in processing push_tnl_action, 
if the port is not found in the port_cache of the PMD). The remaining actions 
continue to be executed even though there are no packets left to process.
It is assumed that the code dealing with each action checks that the batch is 
not empty before executing. Crashes may occur if the assumption is not met.

The patch makes OVS skip processing of further actions from the action-set once 
a batch is emptied. Doing so centralizes the check in one place and avoids the 
possibility of crashes.

Signed-off-by: Vishal Deep Ajmera 
---
lib/odp-execute.c | 9 ++---
1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/odp-execute.c b/lib/odp-execute.c index 3011479..887246d 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -686,9 +686,12 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,

 dp_execute_action(dp, batch, a, may_steal);

-if (last_action) {
-/* We do not need to free the packets. dp_execute_actions()
- * has stolen them */
+if (last_action || (batch->count == 0)) {
+/* We do not need to free the packets.
+ * Either dp_execute_actions() has stolen them
+ * or the batch is freed due to errors. In either
+ * case we do not need to execute further actions.
+ */
 return;
 }
 }
--
1.9.1
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH V2 3/4] tc: Add header rewrite using tc pedit action

2017-11-21 Thread Simon Horman
On Tue, Nov 21, 2017 at 07:50:24AM +0200, Roi Dayan wrote:
> 
> 
> On 20/11/2017 14:30, Simon Horman wrote:
> > On Sun, Nov 19, 2017 at 08:45:19AM +0200, Roi Dayan wrote:
> > > 
> > > 
> > > On 16/11/2017 18:29, Simon Horman wrote:
> > > > On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote:
> > > > > 
> > > > > 
> > > > > On 27/09/2017 12:08, Simon Horman wrote:
> > > > > > On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > On 18/09/2017 18:01, Simon Horman wrote:
> > > > > > > > On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote:
> > > > > > > > > From: Paul Blakey 
> > > > > > > > > 
> > > > > > > > > To be later used to implement ovs action set offloading.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Paul Blakey 
> > > > > > > > > Reviewed-by: Roi Dayan 
> > > > > > > > > ---
> > > > > > > > > lib/tc.c | 372 
> > > > > > > > > ++-
> > > > > > > > > lib/tc.h |  16 +++
> > > > > > > > > 2 files changed, 385 insertions(+), 3 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/lib/tc.c b/lib/tc.c
> > > > > > > > > index c9cada2..743b2ee 100644
> > > > > > > > > --- a/lib/tc.c
> > > > > > > > > +++ b/lib/tc.c
> > > > > > > > > @@ -21,8 +21,10 @@
> > > > > > > > > #include 
> > > > > > > > > #include 
> > > > > > > > > #include 
> > > > > > > > > +#include 
> > > > > > > > > #include 
> > > > > > > > > #include 
> > > > > > > > > +#include 
> > > > > > > > > #include 
> > > > > > > > > #include 
> > > > > > > > > #include 
> > > > > > > > > @@ -33,11 +35,14 @@
> > > > > > > > > #include "netlink-socket.h"
> > > > > > > > > #include "netlink.h"
> > > > > > > > > #include "openvswitch/ofpbuf.h"
> > > > > > > > > +#include "openvswitch/util.h"
> > > > > > > > > #include "openvswitch/vlog.h"
> > > > > > > > > #include "packets.h"
> > > > > > > > > #include "timeval.h"
> > > > > > > > > #include "unaligned.h"
> > > > > > > > > +#define MAX_PEDIT_OFFSETS 8
> > > > > > > > 
> > > > > > > > Why 8?
> > > > > > > We don't expect anything more right now (ipv6 src/dst rewrite 
> > > > > > > requires 8
> > > > > > > pedits iirc). I can't think of a larger use case, maybe ipv6 + 
> > > > > > > macs if
> > > > > > > that's makes sens. do you suggest we increase it? to what?
> > > > > > 
> > > > > > It seems strange to me to place a somewhat arbitrary small limit
> > > > > > when none exists in the pedit API being used. I would at prefer if
> > > > > > it was at least a bigger, say 16 or 32.
> > > > > 
> > > > 
> > > > > Hi Simon,
> > > > > 
> > > > > Sorry for the late reply due to holidays and vacations.
> > > > > Me & Paul going to go over this and do the fixes needed and
> > > > > also rebase over latest master and run tests again.
> > > > > 
> > > > > I'll answer what I'm more familiar with now and Paul will continue.
> > > > > The 8 here is too low and you right. We used this definition
> > > > > for allocation of the pedit keys on the stack in
> > > > > nl_msg_put_flower_rewrite_pedits()
> > > > > 
> > > > > It was for convenience instead of calculating the maximum possible
> > > > > keys that could exists and allocating it there and freeing it at
> > > > > the end.
> > > > > 
> > > > > Increasing it to 32 is probably more than enough and wont waste much.
> > > > 
> > > > I updated the value to 32 when applying the patch.
> > > > 
> > > > ...
> > > > 
> > > > > > > > If I understand the above correctly it is designed to make
> > > > > > > > pedit actions disjoint. If so, why is that necessary? >
> > > > > > > 
> > > > > > > It's not, as a single flower key rewrite can be split to multiple 
> > > > > > > pedit
> > > > > > > actions it finds the overlap between a flower key and a pedit 
> > > > > > > action, if
> > > > > > > they do overlap it translates it to the correct offset and masks 
> > > > > > > it out.
> > > > > > 
> > > > > > Thanks, understood.
> > > > > > 
> > > > > > > 
> > > > > > > > > +} else {
> > > > > > > > > +VLOG_ERR_RL(_rl, "unable to parse legacy 
> > > > > > > > > pedit type: %d",
> > > > > > > > > +nl_attr_type(nla));
> > > > > > > > > +return EOPNOTSUPP;
> > > > > > > > > +}
> > > > > > > > 
> > > > > > > > I think the code could exit early here as
> > > > > > > > nl_msg_put_flower_rewrite_pedits() does below.
> > > > > > > > 
> > > > > > > 
> > > > > > > Sorry, didn't understand. can you give an example?
> > > > > > > 
> > > > > > 
> > > > > > I meant something like this.
> > > > > > 
> > > > > >if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) {
> > > > > >VLOG_ERR_RL(...);
> > > > > >return EOPNOTSUPP;
> > > > > >}
> > > > > 
> > > > > understood. we'll do that. thanks.
> > > > 
> > > > I 

Re: [ovs-dev] [PATCH v2 0/3]: dpif-netdev: Detailed PMD performance metrics and supervision

2017-11-21 Thread Jan Scheurich
I submitted a rebased version of this series yesterday.
Please look at the new v3 series instead.
/Jan

> -Original Message-
> From: Jan Scheurich
> Sent: Wednesday, 18 October, 2017 12:46
> To: 'ovs-dev@openvswitch.org' 
> Cc: Jan Scheurich 
> Subject: [PATCH v2 0/3]: dpif-netdev: Detailed PMD performance metrics and 
> supervision
> 
> Friendly reminder to test and/or review this patch series.
> 
> The run-time performance of PMDs is often difficult to understand and 
> trouble-shoot. The existing PMD statistics counters only provide a
> coarse grained average picture. At packet rates of several Mpps sporadic 
> drops of packet bursts happen at sub-millisecond time scales
> and are impossible to capture and analyze with existing tools.
> 
> This patch set refactors the existing PMD statistics into a dedicated 
> submodule and collects a large number of important PMD
> performance metrics per PMD iteration, maintaining histograms and circular 
> histories for iteration metrics and millisecond averages. To
> capture sporadic drop events, the patch set can be configured to monitor 
> iterations for suspicious metrics and to log the neighborhood of
> such iterations for off-line analysis.
> 
> The extra cost for the performance metric collection and the supervision has 
> been measured to be in the order of 1% compared to the
> base commit in a phy-to-phy setup with VXLAN tunnels (two datapath passes per 
> packet). We believe this is fast enough to not warrant a
> build- or run-time configuration option to disable this.
> 
> The first patch in the series fully includes the changes proposed in Darrel's 
> earlier "[patch_v5 0/3] dpif-netdev: Fix and refactor pmd stats"
> (https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/337686.html).
> 
> Jan Scheurich (3)
>   Patch 1/3: Refactor PMD performance into dpif-netdev-perf
>   Patch 2/3: Detailed performance stats for PMDs
>   Patch 3/3: Detection and logging of suspicious PMD iterations
> 
> v1 -> v2:
>   Rebased to OVS master (commit 7468ec788)
>   No other changes compared to v1
> 
> lib/automake.mk|   2 +
>  lib/dp-packet.h|   2 +
>  lib/dpif-netdev-perf.c | 512 
> +
>  lib/dpif-netdev-perf.h | 317 ++
>  lib/dpif-netdev.c  | 438 +++---
>  lib/netdev-dpdk.c  |  28 ++-
>  lib/netdev-dpdk.h  |  14 ++
>  ofproto/ofproto-dpif.c |   3 +-
>  tests/pmd.at   |  22 ++-
>  9 files changed, 1132 insertions(+), 206 deletions(-)

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev